From c5b34c2b24e83a7df4f71419fa50397b661250d4 Mon Sep 17 00:00:00 2001 From: Justin Williams Date: Wed, 29 Oct 2025 13:52:30 -0400 Subject: [PATCH 1/2] Modify ServerEvents to match new responses from Realtime API Remove model from connect API and rely on configuration changes or initial client_secrets API call to configure Add conversationItemAdded server event and use previousItemId to place entries in correct position --- Sources/Core/Models/ServerEvent.swift | 10 +++++-- Sources/Core/Models/Session.swift | 15 +++++----- Sources/UI/Conversation.swift | 28 +++++++++++++++++-- .../Extensions/RealtimeAPI+WebRTC.swift | 4 +-- .../WebRTC/Extensions/URLRequest+WebRTC.swift | 6 ++-- Sources/WebRTC/WebRTCConnector.swift | 1 + 6 files changed, 46 insertions(+), 18 deletions(-) diff --git a/Sources/Core/Models/ServerEvent.swift b/Sources/Core/Models/ServerEvent.swift index c8ff0af..949c449 100644 --- a/Sources/Core/Models/ServerEvent.swift +++ b/Sources/Core/Models/ServerEvent.swift @@ -22,6 +22,11 @@ import MetaCodable public var token: String } + public struct Usage: Equatable, Hashable, Codable, Sendable { + let type: String + let seconds: Int + } + /// Returned when an error occurs. /// - Parameter eventId: The unique ID of the server event. /// - Parameter error: Details of the error. @@ -90,7 +95,7 @@ import MetaCodable contentIndex: Int, transcript: String, logprobs: [LogProb]?, - usage: Response.Usage + usage: Usage ) /// Returned when the text value of an input audio transcription content part is updated. @@ -107,6 +112,7 @@ import MetaCodable itemId: String, contentIndex: Int, delta: String, + obfuscation: String, logprobs: [LogProb]? 
) @@ -536,7 +542,7 @@ extension ServerEvent: Identifiable { case let .conversationItemDone(id, _, _): id case let .conversationItemRetrieved(id, _): id case let .conversationItemInputAudioTranscriptionCompleted(id, _, _, _, _, _): id - case let .conversationItemInputAudioTranscriptionDelta(id, _, _, _, _): id + case let .conversationItemInputAudioTranscriptionDelta(id, _, _, _, _, _): id case let .conversationItemInputAudioTranscriptionSegment(id, _, _, _, _, _, _, _): id case let .conversationItemInputAudioTranscriptionFailed(id, _, _, _): id case let .conversationItemTruncated(id, _, _, _): id diff --git a/Sources/Core/Models/Session.swift b/Sources/Core/Models/Session.swift index e7ab4bc..4abfaf4 100644 --- a/Sources/Core/Models/Session.swift +++ b/Sources/Core/Models/Session.swift @@ -95,7 +95,7 @@ import HelperCoders } /// Configuration for turn detection - public struct TurnDetection: Codable, Equatable, Hashable, Sendable { + @Codable public struct TurnDetection: Equatable, Hashable, Sendable { /// The type of turn detection. public enum VAD: String, Codable, Equatable, Hashable, Sendable { case server = "server_vad" @@ -108,6 +108,7 @@ import HelperCoders } /// Whether or not to automatically generate a response when a VAD stop event occurs. + @Default(ifMissing: false) public var createResponse: Bool /// Used only for `semantic` mode. The eagerness of the model to respond. @@ -262,13 +263,13 @@ import HelperCoders public var input: Input /// Configuration for output audio. - public var output: Output + public var output: Output? /// Creates a new `Audio` configuration. /// /// - Parameter input: Configuration for input audio. - /// - Parameter output: Configuration for output audio. - public init(input: Input, output: Output) { + /// - Parameter output: Configuration for output audio. Output is nil when this is a transcription session + public init(input: Input, output: Output?) 
{ self.input = input self.output = output } @@ -289,8 +290,8 @@ import HelperCoders /// /// The model can be instructed on response content and format, (e.g. "be extremely succinct", "act friendly", "here are examples of good responses") and on audio behavior (e.g. "talk quickly", "inject emotion into your voice", "laugh frequently"). /// - /// The instructions are not guaranteed to be followed by the model, but they provide guidance to the model on the desired behavior. - public var instructions: String + /// The instructions are not guaranteed to be followed by the model, but they provide guidance to the model on the desired behavior. Instructions are nil for a transcription session. + public var instructions: String? /// Maximum number of output tokens for a single assistant response, inclusive of tool calls. /// @@ -301,7 +302,7 @@ import HelperCoders public var modalities: [Modality]? /// The Realtime model used for this session. - public var model: Model + public var model: Model? /// Reference to a prompt template and its variables. public var prompt: Prompt? 
diff --git a/Sources/UI/Conversation.swift b/Sources/UI/Conversation.swift index 5820057..3d3b514 100644 --- a/Sources/UI/Conversation.swift +++ b/Sources/UI/Conversation.swift @@ -93,9 +93,9 @@ public final class Conversation: @unchecked Sendable { try await client.connect(using: request) } - public func connect(ephemeralKey: String, model: Model = .gptRealtime) async throws { + public func connect(ephemeralKey: String) async throws { do { - try await connect(using: .webRTCConnectionRequest(ephemeralKey: ephemeralKey, model: model)) + try await connect(using: .webRTCConnectionRequest(ephemeralKey: ephemeralKey)) } catch let error as WebRTCConnector.WebRTCError { guard case .invalidEphemeralKey = error else { throw error } throw ConversationError.invalidEphemeralKey @@ -176,10 +176,32 @@ private extension Conversation { if let sessionUpdateCallback { try updateSession(withChanges: sessionUpdateCallback) } case let .sessionUpdated(_, session): self.session = session - case let .conversationItemCreated(_, item, _): + case let .conversationItemAdded(_, item, nil): + entries.append(item) + case let .conversationItemAdded(_, item, previousItemId?): + if let entryIndex = entries.firstIndex(where: { $0.id == previousItemId }) { + entries.insert(item, at: entryIndex + 1) + } else { + entries.append(item) + } + case let .conversationItemCreated(_, item, nil): entries.append(item) + case let .conversationItemCreated(_, item, previousItemId?): + if let entryIndex = entries.firstIndex(where: { $0.id == previousItemId }) { + entries.insert(item, at: entryIndex + 1) + } else { + entries.append(item) + } case let .conversationItemDeleted(_, itemId): entries.removeAll { $0.id == itemId } + case let .conversationItemInputAudioTranscriptionDelta(_, itemId, contentIndex, delta, _, _): + updateEvent(id: itemId) { message in + guard case let .inputAudio(audio) = message.content[contentIndex] else { return } + + message.content[contentIndex] = .inputAudio( + .init(audio: audio.audio, 
transcript: audio.transcript ?? "" + delta) + ) + } case let .conversationItemInputAudioTranscriptionCompleted(_, itemId, contentIndex, transcript, _, _): updateEvent(id: itemId) { message in guard case let .inputAudio(audio) = message.content[contentIndex] else { return } diff --git a/Sources/WebRTC/Extensions/RealtimeAPI+WebRTC.swift b/Sources/WebRTC/Extensions/RealtimeAPI+WebRTC.swift index ae5f3cd..a636ccb 100644 --- a/Sources/WebRTC/Extensions/RealtimeAPI+WebRTC.swift +++ b/Sources/WebRTC/Extensions/RealtimeAPI+WebRTC.swift @@ -11,7 +11,7 @@ public extension RealtimeAPI { } /// Connect to the OpenAI WebRTC Realtime API with the given authentication token and model. - static func webRTC(ephemeralKey: String, model: Model = .gptRealtime) async throws -> RealtimeAPI { - return try await webRTC(connectingTo: .webRTCConnectionRequest(ephemeralKey: ephemeralKey, model: model)) + static func webRTC(ephemeralKey: String) async throws -> RealtimeAPI { + return try await webRTC(connectingTo: .webRTCConnectionRequest(ephemeralKey: ephemeralKey)) } } diff --git a/Sources/WebRTC/Extensions/URLRequest+WebRTC.swift b/Sources/WebRTC/Extensions/URLRequest+WebRTC.swift index 1335ea0..7fe9ed5 100644 --- a/Sources/WebRTC/Extensions/URLRequest+WebRTC.swift +++ b/Sources/WebRTC/Extensions/URLRequest+WebRTC.swift @@ -7,10 +7,8 @@ import FoundationNetworking fileprivate let baseURL = URL(string: "https://api.openai.com/v1/realtime/calls")! 
package extension URLRequest { - static func webRTCConnectionRequest(ephemeralKey: String, model: Model) -> URLRequest { - var request = URLRequest(url: baseURL.appending(queryItems: [ - URLQueryItem(name: "model", value: model.rawValue), - ])) + static func webRTCConnectionRequest(ephemeralKey: String) -> URLRequest { + var request = URLRequest(url: baseURL) request.httpMethod = "POST" request.setValue("Bearer \(ephemeralKey)", forHTTPHeaderField: "Authorization") diff --git a/Sources/WebRTC/WebRTCConnector.swift b/Sources/WebRTC/WebRTCConnector.swift index 3aa29df..35d8576 100644 --- a/Sources/WebRTC/WebRTCConnector.swift +++ b/Sources/WebRTC/WebRTCConnector.swift @@ -196,6 +196,7 @@ extension WebRTCConnector: LKRTCDataChannelDelegate { do { try stream.yield(decoder.decode(ServerEvent.self, from: buffer.data)) } catch { print("Failed to decode server event: \(String(data: buffer.data, encoding: .utf8) ?? "")") + print("Error: \(error)") stream.finish(throwing: error) } } From ab461b8bc0d263ff446da6e2676df176d1a35c2f Mon Sep 17 00:00:00 2001 From: Justin Williams Date: Wed, 29 Oct 2025 15:27:11 -0400 Subject: [PATCH 2/2] Remove hard-coded type parameter on Session. Can be realtime or transcription --- Sources/Core/Models/Session.swift | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/Sources/Core/Models/Session.swift b/Sources/Core/Models/Session.swift index 4abfaf4..f0b9e34 100644 --- a/Sources/Core/Models/Session.swift +++ b/Sources/Core/Models/Session.swift @@ -276,7 +276,8 @@ import HelperCoders } /// The type of session to create. - public let type: String = "realtime" + /// Valid values are "realtime" or "transcription" + public let type: String /// Unique identifier for the session public var id: String? @@ -318,8 +319,9 @@ import HelperCoders /// Tools available to the model. public var tools: [Tool]? - public init(id: String? = nil, audio: Audio, instructions: String, maxResponseOutputTokens: MaxResponseOutputTokens? 
= nil, modalities: [Modality]? = nil, model: Model, prompt: Prompt? = nil, temperature: Double? = nil, toolChoice: Tool.Choice? = nil, tools: [Tool]? = nil) { + public init(id: String? = nil, type: String, audio: Audio, instructions: String, maxResponseOutputTokens: MaxResponseOutputTokens? = nil, modalities: [Modality]? = nil, model: Model, prompt: Prompt? = nil, temperature: Double? = nil, toolChoice: Tool.Choice? = nil, tools: [Tool]? = nil) { self.id = id + self.type = type self.tools = tools self.model = model self.audio = audio