From ca94fa2da2489f44009c805936ffd25eceb719dc Mon Sep 17 00:00:00 2001 From: Bhargav Annem Date: Mon, 10 Nov 2025 01:32:16 -0800 Subject: [PATCH 1/3] feat: input_image support --- Sources/Core/Models/Item.swift | 13 +++++++++++-- Sources/UI/Conversation.swift | 19 +++++++++++++++++-- 2 files changed, 28 insertions(+), 4 deletions(-) diff --git a/Sources/Core/Models/Item.swift b/Sources/Core/Models/Item.swift index 4f24a58..8ccea78 100644 --- a/Sources/Core/Models/Item.swift +++ b/Sources/Core/Models/Item.swift @@ -37,12 +37,14 @@ import MetaCodable case text(String) case audio(Audio) case inputText(String) + case inputImage(String) case inputAudio(Audio) public var text: String? { switch self { case let .text(text): text case let .inputText(text): text + case let .inputImage(image): image case let .audio(audio): audio.transcript case let .inputAudio(audio): audio.transcript } @@ -419,6 +421,7 @@ extension Item.Message.Content: Codable { case text case audio case transcript + case image_url } private struct Text: Codable { @@ -440,7 +443,10 @@ extension Item.Message.Content: Codable { case "input_text": let container = try decoder.container(keyedBy: Text.CodingKeys.self) self = try .inputText(container.decode(String.self, forKey: .text)) - case "output_audio": + case "input_image": + let inner = try decoder.container(keyedBy: CodingKeys.self) + self = try .inputImage(inner.decodeIfPresent(String.self, forKey: .image_url) ?? 
"") + case "output_audio": self = try .audio(Item.Audio(from: decoder)) case "input_audio": self = try .inputAudio(Item.Audio(from: decoder)) @@ -459,7 +465,10 @@ extension Item.Message.Content: Codable { case let .inputText(text): try container.encode(text, forKey: .text) try container.encode("input_text", forKey: .type) - case let .audio(audio): + case let .inputImage(imageURL): + try container.encode(imageURL, forKey: .image_url) + try container.encode("input_image", forKey: .type) + case let .audio(audio): try container.encode("output_audio", forKey: .type) try container.encode(audio.audio, forKey: .audio) try container.encode(audio.transcript, forKey: .transcript) diff --git a/Sources/UI/Conversation.swift b/Sources/UI/Conversation.swift index 5820057..bfcb39c 100644 --- a/Sources/UI/Conversation.swift +++ b/Sources/UI/Conversation.swift @@ -13,7 +13,7 @@ public enum ConversationError: Error { public final class Conversation: @unchecked Sendable { public typealias SessionUpdateCallback = (inout Session) -> Void - private let client: WebRTCConnector + private let client: WebRTCConnector private var task: Task! private let sessionUpdateCallback: SessionUpdateCallback? private let errorStream: AsyncStream.Continuation @@ -156,7 +156,22 @@ public final class Conversation: @unchecked Sendable { try send(event: .createResponse(using: response)) } - /// Send the response of a function call. + /// Send an image + text message and wait for a response. + public func send(from role: Item.Message.Role, text: String, image: Data, response: Response.Config? = nil) throws { + let dataURI = "data:image/jpeg;base64,\(image.base64EncodedString())" + let message = Item.Message( + id: String(randomLength: 32), + role: role, + content: [ + .inputText(text), + .inputImage(dataURI) + ] + ) + try send(event: .createConversationItem(.message(message))) + try send(event: .createResponse(using: response)) + } + + /// Send the response of a function call. 
public func send(result output: Item.FunctionCallOutput) throws { try send(event: .createConversationItem(.functionCallOutput(output))) } From 6d6f8410e1fa74230682907ef6cda41ec73b4a2a Mon Sep 17 00:00:00 2001 From: Bhargav Annem Date: Mon, 10 Nov 2025 21:53:52 -0800 Subject: [PATCH 2/3] fix: send only image, no prompt --- Sources/UI/Conversation.swift | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/Sources/UI/Conversation.swift b/Sources/UI/Conversation.swift index bfcb39c..0bca326 100644 --- a/Sources/UI/Conversation.swift +++ b/Sources/UI/Conversation.swift @@ -157,15 +157,12 @@ public final class Conversation: @unchecked Sendable { } /// Send an image + text message and wait for a response. - public func send(from role: Item.Message.Role, text: String, image: Data, response: Response.Config? = nil) throws { + public func send(from role: Item.Message.Role, image: Data, response: Response.Config? = nil) throws { let dataURI = "data:image/jpeg;base64,\(image.base64EncodedString())" let message = Item.Message( id: String(randomLength: 32), role: role, - content: [ - .inputText(text), - .inputImage(dataURI) - ] + content: [.inputImage(dataURI)] ) try send(event: .createConversationItem(.message(message))) try send(event: .createResponse(using: response)) From 9b5bcde8a61a1c5e94a4f9c7b311b970890cab64 Mon Sep 17 00:00:00 2001 From: Bhargav Annem Date: Mon, 17 Nov 2025 17:08:25 -0800 Subject: [PATCH 3/3] feat: cleaning --- Sources/UI/Conversation.swift | 9 ++++++--- Sources/UI/Extensions/String+random.swift | 8 -------- 2 files changed, 6 insertions(+), 11 deletions(-) delete mode 100644 Sources/UI/Extensions/String+random.swift diff --git a/Sources/UI/Conversation.swift b/Sources/UI/Conversation.swift index 0bca326..dd46126 100644 --- a/Sources/UI/Conversation.swift +++ b/Sources/UI/Conversation.swift @@ -152,15 +152,17 @@ public final class Conversation: @unchecked Sendable { /// Send a text message and wait for a response.
/// Optionally, you can provide a response configuration to customize the model's behavior. public func send(from role: Item.Message.Role, text: String, response: Response.Config? = nil) throws { - try send(event: .createConversationItem(.message(Item.Message(id: String(randomLength: 32), role: role, content: [.inputText(text)])))) + let id = UUID().uuidString.replacingOccurrences(of: "-", with: "") // random 32 character string + try send(event: .createConversationItem(.message(Item.Message(id: id, role: role, content: [.inputText(text)])))) try send(event: .createResponse(using: response)) } /// Send an image + text message and wait for a response. public func send(from role: Item.Message.Role, image: Data, response: Response.Config? = nil) throws { let dataURI = "data:image/jpeg;base64,\(image.base64EncodedString())" + let id = UUID().uuidString.replacingOccurrences(of: "-", with: "") // random 32 character string let message = Item.Message( - id: String(randomLength: 32), + id: id, role: role, content: [.inputImage(dataURI)] ) @@ -188,7 +190,8 @@ private extension Conversation { if let sessionUpdateCallback { try updateSession(withChanges: sessionUpdateCallback) } case let .sessionUpdated(_, session): self.session = session - case let .conversationItemCreated(_, item, _): + case let .conversationItemCreated(_, item, _), + let .conversationItemAdded(_, item, _): entries.append(item) case let .conversationItemDeleted(_, itemId): entries.removeAll { $0.id == itemId } diff --git a/Sources/UI/Extensions/String+random.swift b/Sources/UI/Extensions/String+random.swift deleted file mode 100644 index bad18c4..0000000 --- a/Sources/UI/Extensions/String+random.swift +++ /dev/null @@ -1,8 +0,0 @@ -import Foundation - -extension String { - init(randomLength length: Int) { - let letters = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789" - self = String((0..