diff --git a/FirebaseAI/CHANGELOG.md b/FirebaseAI/CHANGELOG.md index 8c0a0068fde..be26755eceb 100644 --- a/FirebaseAI/CHANGELOG.md +++ b/FirebaseAI/CHANGELOG.md @@ -11,6 +11,12 @@ - [fixed] Fixed a decoding error when generating images with the `gemini-2.5-flash-image-preview` model using `generateContentStream` or `sendMessageStream` with the Gemini Developer API. (#15262) +- [feature] Added support for the Live API, which allows bidirectional + communication with the model in realtime. + + To get started with the Live API, see the Firebase docs on + [Bidirectional streaming using the Gemini Live API](https://firebase.google.com/docs/ai-logic/live-api). + (#15309) # 12.2.0 - [feature] Added support for returning thought summaries, which are synthesized diff --git a/FirebaseAI/Sources/AILog.swift b/FirebaseAI/Sources/AILog.swift index 03232ff23df..460f1f3aaa8 100644 --- a/FirebaseAI/Sources/AILog.swift +++ b/FirebaseAI/Sources/AILog.swift @@ -67,12 +67,25 @@ enum AILog { case executableCodeUnrecognizedLanguage = 3016 case fallbackValueUsed = 3017 case urlMetadataUnrecognizedURLRetrievalStatus = 3018 + case liveSessionUnsupportedMessage = 3019 + case liveSessionUnsupportedMessagePayload = 3020 + case liveSessionFailedToEncodeClientMessage = 3021 + case liveSessionFailedToEncodeClientMessagePayload = 3022 + case liveSessionFailedToSendClientMessage = 3023 + case liveSessionUnexpectedResponse = 3024 + case liveSessionGoingAwaySoon = 3025 + case decodedMissingProtoDurationSuffix = 3026 + case decodedInvalidProtoDurationString = 3027 + case decodedInvalidProtoDurationSeconds = 3028 + case decodedInvalidProtoDurationNanoseconds = 3029 // SDK State Errors case generateContentResponseNoCandidates = 4000 case generateContentResponseNoText = 4001 case appCheckTokenFetchFailed = 4002 case generateContentResponseEmptyCandidates = 4003 + case invalidWebsocketURL = 4004 + case duplicateLiveSessionSetupComplete = 4005 // SDK Debugging case loadRequestStreamResponseLine = 5000 
diff --git a/FirebaseAI/Sources/FirebaseAI.swift b/FirebaseAI/Sources/FirebaseAI.swift index ecd9a92077e..fdd870ecfcf 100644 --- a/FirebaseAI/Sources/FirebaseAI.swift +++ b/FirebaseAI/Sources/FirebaseAI.swift @@ -137,6 +137,46 @@ public final class FirebaseAI: Sendable { ) } + /// **[Public Preview]** Initializes a ``LiveGenerativeModel`` with the given parameters. + /// + /// > Warning: Using the Firebase AI Logic SDKs with the Gemini Live API is in Public + /// Preview, which means that the feature is not subject to any SLA or deprecation policy and + /// could change in backwards-incompatible ways. + /// + /// > Important: Only models that support the Gemini Live API (typically containing `live-*` in + /// the name) are supported. + /// + /// - Parameters: + /// - modelName: The name of the model to use, for example + /// `"gemini-live-2.5-flash-preview"`; + /// see [model versions](https://firebase.google.com/docs/ai-logic/live-api?api=dev#models-that-support-capability) + /// for a list of supported models. + /// - generationConfig: The content generation parameters your model should use. + /// - tools: A list of ``Tool`` objects that the model may use to generate the next response. + /// - toolConfig: Tool configuration for any ``Tool`` specified in the request. + /// - systemInstruction: Instructions that direct the model to behave a certain way; currently + /// only text content is supported. + /// - requestOptions: Configuration parameters for sending requests to the backend. + @available(iOS 15.0, macOS 12.0, macCatalyst 15.0, tvOS 15.0, *) + @available(watchOS, unavailable) + public func liveModel(modelName: String, + generationConfig: LiveGenerationConfig? = nil, + tools: [Tool]? = nil, + toolConfig: ToolConfig? = nil, + systemInstruction: ModelContent? 
= nil, + requestOptions: RequestOptions = RequestOptions()) -> LiveGenerativeModel { + return LiveGenerativeModel( + modelResourceName: modelResourceName(modelName: modelName), + firebaseInfo: firebaseInfo, + apiConfig: apiConfig, + generationConfig: generationConfig, + tools: tools, + toolConfig: toolConfig, + systemInstruction: systemInstruction, + requestOptions: requestOptions + ) + } + /// Class to enable FirebaseAI to register via the Objective-C based Firebase component system /// to include FirebaseAI in the userAgent. @objc(FIRVertexAIComponent) class FirebaseVertexAIComponent: NSObject {} diff --git a/FirebaseAI/Sources/GenerativeAIService.swift b/FirebaseAI/Sources/GenerativeAIService.swift index 8056d4172b8..a17364f8cb6 100644 --- a/FirebaseAI/Sources/GenerativeAIService.swift +++ b/FirebaseAI/Sources/GenerativeAIService.swift @@ -177,7 +177,10 @@ struct GenerativeAIService { urlRequest.setValue("application/json", forHTTPHeaderField: "Content-Type") if let appCheck = firebaseInfo.appCheck { - let tokenResult = try await fetchAppCheckToken(appCheck: appCheck) + let tokenResult = try await appCheck.fetchAppCheckToken( + limitedUse: firebaseInfo.useLimitedUseAppCheckTokens, + domain: "GenerativeAIService" + ) urlRequest.setValue(tokenResult.token, forHTTPHeaderField: "X-Firebase-AppCheck") if let error = tokenResult.error { AILog.error( @@ -207,53 +210,6 @@ struct GenerativeAIService { return urlRequest } - private func fetchAppCheckToken(appCheck: AppCheckInterop) async throws - -> FIRAppCheckTokenResultInterop { - if firebaseInfo.useLimitedUseAppCheckTokens { - if let token = await getLimitedUseAppCheckToken(appCheck: appCheck) { - return token - } - - let errorMessage = - "The provided App Check token provider doesn't implement getLimitedUseToken(), but requireLimitedUseTokens was enabled." 
- - #if Debug - fatalError(errorMessage) - #else - throw NSError( - domain: "\(Constants.baseErrorDomain).\(Self.self)", - code: AILog.MessageCode.appCheckTokenFetchFailed.rawValue, - userInfo: [NSLocalizedDescriptionKey: errorMessage] - ) - #endif - } - - return await appCheck.getToken(forcingRefresh: false) - } - - private func getLimitedUseAppCheckToken(appCheck: AppCheckInterop) async - -> FIRAppCheckTokenResultInterop? { - // At the moment, `await` doesn’t get along with Objective-C’s optional protocol methods. - await withCheckedContinuation { (continuation: CheckedContinuation< - FIRAppCheckTokenResultInterop?, - Never - >) in - guard - firebaseInfo.useLimitedUseAppCheckTokens, - // `getLimitedUseToken(completion:)` is an optional protocol method. Optional binding - // is performed to make sure `continuation` is called even if the method’s not implemented. - let limitedUseTokenClosure = appCheck.getLimitedUseToken - else { - return continuation.resume(returning: nil) - } - - limitedUseTokenClosure { tokenResult in - // The placeholder token should be used in the case of App Check error. - continuation.resume(returning: tokenResult) - } - } - } - private func httpResponse(urlResponse: URLResponse) throws -> HTTPURLResponse { // The following condition should always be true: "Whenever you make HTTP URL load requests, any // response objects you get back from the URLSession, NSURLConnection, or NSURLDownload class diff --git a/FirebaseAI/Sources/Types/Internal/APIConfig.swift b/FirebaseAI/Sources/Types/Internal/APIConfig.swift index f9c5d32c779..e854db25c8c 100644 --- a/FirebaseAI/Sources/Types/Internal/APIConfig.swift +++ b/FirebaseAI/Sources/Types/Internal/APIConfig.swift @@ -68,6 +68,7 @@ extension APIConfig { extension APIConfig.Service { /// Network addresses for generative AI API services. + // TODO: maybe remove the https:// prefix and just add it as needed? websockets use these too. 
enum Endpoint: String, Encodable { /// The Firebase proxy production endpoint. /// diff --git a/FirebaseAI/Sources/Types/Internal/AppCheck.swift b/FirebaseAI/Sources/Types/Internal/AppCheck.swift new file mode 100644 index 00000000000..3b6d784f636 --- /dev/null +++ b/FirebaseAI/Sources/Types/Internal/AppCheck.swift @@ -0,0 +1,74 @@ +// Copyright 2025 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +import FirebaseAppCheckInterop + +/// Internal helper extension for fetching App Check tokens. +/// +/// Provides a common means for fetching limited use tokens, and falling back to standard tokens +/// when it's disabled (or in debug mode). This also centralizes the error, since this method is +/// used in multiple places. +extension AppCheckInterop { + /// Fetch the App Check token. + /// + /// - Parameters: + /// - limitedUse: Should the token be a limited-use token, or a standard token. + /// - domain: A string indicating where this method is being called from. Used in any thrown + /// errors, to avoid hard-to-parse traces. + func fetchAppCheckToken(limitedUse: Bool, + domain: String) async throws -> FIRAppCheckTokenResultInterop { + if limitedUse { + if let token = await getLimitedUseTokenAsync() { + return token + } + + let errorMessage = + "The provided App Check token provider doesn't implement getLimitedUseToken(), but requireLimitedUseTokens was enabled."
+ + #if DEBUG + fatalError(errorMessage) + #else + throw NSError( + domain: "\(Constants.baseErrorDomain).\(domain)", + code: AILog.MessageCode.appCheckTokenFetchFailed.rawValue, + userInfo: [NSLocalizedDescriptionKey: errorMessage] + ) + #endif + } + + return await getToken(forcingRefresh: false) + } + + private func getLimitedUseTokenAsync() async + -> FIRAppCheckTokenResultInterop? { + // At the moment, `await` doesn’t get along with Objective-C’s optional protocol methods. + await withCheckedContinuation { (continuation: CheckedContinuation< + FIRAppCheckTokenResultInterop?, + Never + >) in + guard + // `getLimitedUseToken(completion:)` is an optional protocol method. Optional binding + // is performed to make sure `continuation` is called even if the method’s not implemented. + let limitedUseTokenClosure = getLimitedUseToken + else { + return continuation.resume(returning: nil) + } + + limitedUseTokenClosure { tokenResult in + // The placeholder token should be used in the case of App Check error. + continuation.resume(returning: tokenResult) + } + } + } +} diff --git a/FirebaseAI/Sources/Types/Internal/InternalPart.swift b/FirebaseAI/Sources/Types/Internal/InternalPart.swift index a8afe4439c3..a9d5a2eb810 100644 --- a/FirebaseAI/Sources/Types/Internal/InternalPart.swift +++ b/FirebaseAI/Sources/Types/Internal/InternalPart.swift @@ -45,10 +45,12 @@ struct FileData: Codable, Equatable, Sendable { struct FunctionCall: Equatable, Sendable { let name: String let args: JSONObject + let id: String? - init(name: String, args: JSONObject) { + init(name: String, args: JSONObject, id: String?) { self.name = name self.args = args + self.id = id } } @@ -56,10 +58,12 @@ struct FunctionCall: Equatable, Sendable { struct FunctionResponse: Codable, Equatable, Sendable { let name: String let response: JSONObject + let id: String? - init(name: String, response: JSONObject) { + init(name: String, response: JSONObject, id: String?
= nil) { self.name = name self.response = response + self.id = id } } @@ -135,6 +139,7 @@ extension FunctionCall: Codable { } else { args = JSONObject() } + id = try container.decodeIfPresent(String.self, forKey: .id) } } diff --git a/FirebaseAI/Sources/Types/Internal/Live/AsyncWebSocket.swift b/FirebaseAI/Sources/Types/Internal/Live/AsyncWebSocket.swift new file mode 100644 index 00000000000..81c1c337258 --- /dev/null +++ b/FirebaseAI/Sources/Types/Internal/Live/AsyncWebSocket.swift @@ -0,0 +1,149 @@ +// Copyright 2025 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +import Foundation +private import FirebaseCoreInternal + +/// Async API for interacting with web sockets. +/// +/// Internally, this just wraps around a `URLSessionWebSocketTask`, and provides a more async +/// friendly interface for sending and consuming data from it. +/// +/// Also surfaces a more fine-grained ``WebSocketClosedError`` for when the web socket is closed. 
+@available(iOS 15.0, macOS 12.0, macCatalyst 15.0, tvOS 15.0, *) +@available(watchOS, unavailable) +final class AsyncWebSocket: Sendable { + private let webSocketTask: URLSessionWebSocketTask + private let stream: AsyncThrowingStream + private let continuation: AsyncThrowingStream.Continuation + private let continuationFinished = UnfairLock(false) + private let closeError: UnfairLock + + init(urlSession: URLSession = GenAIURLSession.default, urlRequest: URLRequest) { + webSocketTask = urlSession.webSocketTask(with: urlRequest) + (stream, continuation) = AsyncThrowingStream + .makeStream() + closeError = UnfairLock(nil) + } + + deinit { + disconnect() + } + + /// Starts a connection to the backend, returning a stream for the websocket responses. + func connect() -> AsyncThrowingStream { + webSocketTask.resume() + closeError.withLock { $0 = nil } + startReceiving() + return stream + } + + /// Closes the websocket, if it's not already closed. + func disconnect() { + guard closeError.value() == nil else { return } + + close(code: .goingAway, reason: nil) + } + + /// Sends a message to the server, through the websocket. + /// + /// If the web socket is closed, this method will throw the error it was closed with. + func send(_ message: URLSessionWebSocketTask.Message) async throws { + if let closeError = closeError.value() { + throw closeError + } + try await webSocketTask.send(message) + } + + private func startReceiving() { + Task { + while !Task.isCancelled && self.webSocketTask.isOpen && self.closeError.value() == nil { + do { + let message = try await webSocketTask.receive() + continuation.yield(message) + } catch { + if let error = webSocketTask.error as? 
NSError { + close( + code: webSocketTask.closeCode, + reason: webSocketTask.closeReason, + underlyingError: error + ) + } else { + close(code: webSocketTask.closeCode, reason: webSocketTask.closeReason) + } + } + } + } + } + + private func close(code: URLSessionWebSocketTask.CloseCode, + reason: Data?, + underlyingError: Error? = nil) { + let error = WebSocketClosedError( + closeCode: code, + closeReason: reason, + underlyingError: underlyingError + ) + closeError.withLock { + $0 = error + } + + webSocketTask.cancel(with: code, reason: reason) + + continuationFinished.withLock { isFinished in + guard !isFinished else { return } + self.continuation.finish(throwing: error) + isFinished = true + } + } +} + +private extension URLSessionWebSocketTask { + var isOpen: Bool { + return closeCode == .invalid + } +} + +/// The websocket was closed. +/// +/// See the `closeReason` for why, or the `errorCode` for the corresponding +/// `URLSessionWebSocketTask.CloseCode`. +/// +/// In some cases, the `NSUnderlyingErrorKey` key may be populated with an +/// error for additional context. +struct WebSocketClosedError: Error, Sendable, CustomNSError { + let closeCode: URLSessionWebSocketTask.CloseCode + let closeReason: String + let underlyingError: Error? + + init(closeCode: URLSessionWebSocketTask.CloseCode, closeReason: Data?, + underlyingError: Error? = nil) { + self.closeCode = closeCode + self.closeReason = closeReason + .flatMap { String(data: $0, encoding: .utf8) } ?? "Unknown reason." + self.underlyingError = underlyingError + } + + var errorCode: Int { closeCode.rawValue } + + var errorUserInfo: [String: Any] { + var userInfo: [String: Any] = [ + NSLocalizedDescriptionKey: "WebSocket closed with code \(closeCode.rawValue). 
Reason: \(closeReason)", + ] + if let underlyingError { + userInfo[NSUnderlyingErrorKey] = underlyingError + } + return userInfo + } +} diff --git a/FirebaseAI/Sources/Types/Internal/Live/BidiGenerateContentClientContent.swift b/FirebaseAI/Sources/Types/Internal/Live/BidiGenerateContentClientContent.swift new file mode 100644 index 00000000000..459aa258cc3 --- /dev/null +++ b/FirebaseAI/Sources/Types/Internal/Live/BidiGenerateContentClientContent.swift @@ -0,0 +1,36 @@ +// Copyright 2025 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +import Foundation + +/// Incremental update of the current conversation delivered from the client. +/// All the content here is unconditionally appended to the conversation +/// history and used as part of the prompt to the model to generate content. +/// +/// A message here will interrupt any current model generation. +@available(iOS 15.0, macOS 12.0, macCatalyst 15.0, tvOS 15.0, *) +@available(watchOS, unavailable) +struct BidiGenerateContentClientContent: Encodable { + /// The content appended to the current conversation with the model. + /// + /// For single-turn queries, this is a single instance. For multi-turn + /// queries, this is a repeated field that contains conversation history and + /// latest request. + let turns: [ModelContent]? + + /// If true, indicates that the server content generation should start with + /// the currently accumulated prompt. 
Otherwise, the server will await + /// additional messages before starting generation. + let turnComplete: Bool? +} diff --git a/FirebaseAI/Sources/Types/Internal/Live/BidiGenerateContentClientMessage.swift b/FirebaseAI/Sources/Types/Internal/Live/BidiGenerateContentClientMessage.swift new file mode 100644 index 00000000000..758d75e2cc7 --- /dev/null +++ b/FirebaseAI/Sources/Types/Internal/Live/BidiGenerateContentClientMessage.swift @@ -0,0 +1,57 @@ +// Copyright 2025 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +import Foundation + +/// Messages sent by the client in the BidiGenerateContent RPC call. +@available(iOS 15.0, macOS 12.0, macCatalyst 15.0, tvOS 15.0, *) +@available(watchOS, unavailable) +enum BidiGenerateContentClientMessage { + /// Message to be sent in the first and only first client message. + case setup(BidiGenerateContentSetup) + + /// Incremental update of the current conversation delivered from the client. + case clientContent(BidiGenerateContentClientContent) + + /// User input that is sent in real time. + case realtimeInput(BidiGenerateContentRealtimeInput) + + /// Response to a `ToolCallMessage` received from the server. 
+ case toolResponse(BidiGenerateContentToolResponse) +} + +@available(iOS 15.0, macOS 12.0, macCatalyst 15.0, tvOS 15.0, *) +@available(watchOS, unavailable) +extension BidiGenerateContentClientMessage: Encodable { + enum CodingKeys: CodingKey { + case setup + case clientContent + case realtimeInput + case toolResponse + } + + func encode(to encoder: any Encoder) throws { + var container = encoder.container(keyedBy: CodingKeys.self) + switch self { + case let .setup(setup): + try container.encode(setup, forKey: .setup) + case let .clientContent(clientContent): + try container.encode(clientContent, forKey: .clientContent) + case let .realtimeInput(realtimeInput): + try container.encode(realtimeInput, forKey: .realtimeInput) + case let .toolResponse(toolResponse): + try container.encode(toolResponse, forKey: .toolResponse) + } + } +} diff --git a/FirebaseAI/Sources/Types/Internal/Live/BidiGenerateContentRealtimeInput.swift b/FirebaseAI/Sources/Types/Internal/Live/BidiGenerateContentRealtimeInput.swift new file mode 100644 index 00000000000..753a9a3fb15 --- /dev/null +++ b/FirebaseAI/Sources/Types/Internal/Live/BidiGenerateContentRealtimeInput.swift @@ -0,0 +1,76 @@ +// Copyright 2025 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +import Foundation + +/// User input that is sent in real time. +/// +/// This is different from `ClientContentUpdate` in a few ways: +/// +/// - Can be sent continuously without interruption to model generation. 
+/// - If there is a need to mix data interleaved across the +/// `ClientContentUpdate` and the `RealtimeUpdate`, server attempts to +/// optimize for best response, but there are no guarantees. +/// - End of turn is not explicitly specified, but is rather derived from user +/// activity (for example, end of speech). +/// - Even before the end of turn, the data is processed incrementally +/// to optimize for a fast start of the response from the model. +/// - Is always assumed to be the user's input (cannot be used to populate +/// conversation history). +@available(iOS 15.0, macOS 12.0, macCatalyst 15.0, tvOS 15.0, *) +@available(watchOS, unavailable) +struct BidiGenerateContentRealtimeInput: Encodable { + /// These form the realtime audio input stream. + let audio: InlineData? + + /// Indicates that the audio stream has ended, e.g. because the microphone was + /// turned off. + /// + /// This should only be sent when automatic activity detection is enabled + /// (which is the default). + /// + /// The client can reopen the stream by sending an audio message. + let audioStreamEnd: Bool? + + /// These form the realtime video input stream. + let video: InlineData? + + /// These form the realtime text input stream. + let text: String? + + /// Marks the start of user activity. + struct ActivityStart: Encodable {} + + /// Marks the start of user activity. This can only be sent if automatic + /// (i.e. server-side) activity detection is disabled. + let activityStart: ActivityStart? + + /// Marks the end of user activity. + struct ActivityEnd: Encodable {} + + /// Marks the end of user activity. This can only be sent if automatic (i.e. + /// server-side) activity detection is disabled. + let activityEnd: ActivityEnd? + + init(audio: InlineData? = nil, video: InlineData? = nil, text: String? = nil, + activityStart: ActivityStart? = nil, activityEnd: ActivityEnd? = nil, + audioStreamEnd: Bool? 
= nil) { + self.audio = audio + self.video = video + self.text = text + self.activityStart = activityStart + self.activityEnd = activityEnd + self.audioStreamEnd = audioStreamEnd + } +} diff --git a/FirebaseAI/Sources/Types/Internal/Live/BidiGenerateContentServerContent.swift b/FirebaseAI/Sources/Types/Internal/Live/BidiGenerateContentServerContent.swift new file mode 100644 index 00000000000..648d7a09ed8 --- /dev/null +++ b/FirebaseAI/Sources/Types/Internal/Live/BidiGenerateContentServerContent.swift @@ -0,0 +1,58 @@ +// Copyright 2025 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +import Foundation + +/// Incremental server update generated by the model in response to client +/// messages. +/// +/// Content is generated as quickly as possible, and not in realtime. Clients +/// may choose to buffer and play it out in realtime. +@available(iOS 15.0, macOS 12.0, macCatalyst 15.0, tvOS 15.0, *) +@available(watchOS, unavailable) +struct BidiGenerateContentServerContent: Decodable, Sendable { + /// The content that the model has generated as part of the current + /// conversation with the user. + let modelTurn: ModelContent? + + /// If true, indicates that the model is done generating. Generation will only + /// start in response to additional client messages. Can be set alongside + /// `content`, indicating that the `content` is the last in the turn. + let turnComplete: Bool? 
+ + /// If true, indicates that a client message has interrupted current model + /// generation. If the client is playing out the content in realtime, this is a + /// good signal to stop and empty the current queue. If the client is playing + /// out the content in realtime, this is a good signal to stop and empty the + /// current playback queue. + let interrupted: Bool? + + /// If true, indicates that the model is done generating. + /// + /// When model is interrupted while generating there will be no + /// 'generation_complete' message in interrupted turn, it will go through + /// 'interrupted > turn_complete'. + /// + /// When model assumes realtime playback there will be delay between + /// generation_complete and turn_complete that is caused by model waiting for + /// playback to finish. + let generationComplete: Bool? + + /// Metadata specifies sources used to ground generated content. + let groundingMetadata: GroundingMetadata? + + let inputTranscription: BidiGenerateContentTranscription? + + let outputTranscription: BidiGenerateContentTranscription? +} diff --git a/FirebaseAI/Sources/Types/Internal/Live/BidiGenerateContentServerMessage.swift b/FirebaseAI/Sources/Types/Internal/Live/BidiGenerateContentServerMessage.swift new file mode 100644 index 00000000000..8c7c628ebdb --- /dev/null +++ b/FirebaseAI/Sources/Types/Internal/Live/BidiGenerateContentServerMessage.swift @@ -0,0 +1,105 @@ +// Copyright 2025 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +import Foundation + +/// Response message for BidiGenerateContent RPC call. +@available(iOS 15.0, macOS 12.0, macCatalyst 15.0, tvOS 15.0, *) +@available(watchOS, unavailable) +struct BidiGenerateContentServerMessage: Sendable { + /// The type of the message. + enum MessageType: Sendable { + /// Sent in response to a `BidiGenerateContentSetup` message from the client. + case setupComplete(BidiGenerateContentSetupComplete) + + /// Content generated by the model in response to client messages. + case serverContent(BidiGenerateContentServerContent) + + /// Request for the client to execute the `function_calls` and return the + /// responses with the matching `id`s. + case toolCall(BidiGenerateContentToolCall) + + /// Notification for the client that a previously issued + /// `ToolCallMessage` with the specified `id`s should not have been executed + /// and should be cancelled. + case toolCallCancellation(BidiGenerateContentToolCallCancellation) + + /// Server will disconnect soon. + case goAway(GoAway) + } + + /// The message type. + let messageType: MessageType + + /// Usage metadata about the response(s). + let usageMetadata: GenerateContentResponse.UsageMetadata?
+} + +// MARK: - Decodable + +@available(iOS 15.0, macOS 12.0, macCatalyst 15.0, tvOS 15.0, *) +@available(watchOS, unavailable) +extension BidiGenerateContentServerMessage: Decodable { + enum CodingKeys: String, CodingKey { + case setupComplete + case serverContent + case toolCall + case toolCallCancellation + case goAway + case usageMetadata + } + + public init(from decoder: any Decoder) throws { + let container = try decoder.container(keyedBy: CodingKeys.self) + + if let setupComplete = try container.decodeIfPresent( + BidiGenerateContentSetupComplete.self, + forKey: .setupComplete + ) { + messageType = .setupComplete(setupComplete) + } else if let serverContent = try container.decodeIfPresent( + BidiGenerateContentServerContent.self, + forKey: .serverContent + ) { + messageType = .serverContent(serverContent) + } else if let toolCall = try container.decodeIfPresent( + BidiGenerateContentToolCall.self, + forKey: .toolCall + ) { + messageType = .toolCall(toolCall) + } else if let toolCallCancellation = try container.decodeIfPresent( + BidiGenerateContentToolCallCancellation.self, + forKey: .toolCallCancellation + ) { + messageType = .toolCallCancellation(toolCallCancellation) + } else if let goAway = try container.decodeIfPresent(GoAway.self, forKey: .goAway) { + messageType = .goAway(goAway) + } else { + throw InvalidMessageTypeError() + } + + usageMetadata = try container.decodeIfPresent( + GenerateContentResponse.UsageMetadata.self, + forKey: .usageMetadata + ) + } +} + +struct InvalidMessageTypeError: Error, Sendable, CustomNSError { + public var errorUserInfo: [String: Any] { + [ + NSLocalizedDescriptionKey: "Missing server message type.", + ] + } +} diff --git a/FirebaseAI/Sources/Types/Internal/Live/BidiGenerateContentSetup.swift b/FirebaseAI/Sources/Types/Internal/Live/BidiGenerateContentSetup.swift new file mode 100644 index 00000000000..15dc8889a0b --- /dev/null +++ b/FirebaseAI/Sources/Types/Internal/Live/BidiGenerateContentSetup.swift @@
-0,0 +1,78 @@ +// Copyright 2025 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +import Foundation + +/// Message to be sent in the first and only first +/// `BidiGenerateContentClientMessage`. Contains configuration that will apply +/// for the duration of the streaming RPC. +/// +/// Clients should wait for a `BidiGenerateContentSetupComplete` message before +/// sending any additional messages. +@available(iOS 15.0, macOS 12.0, macCatalyst 15.0, tvOS 15.0, *) +@available(watchOS, unavailable) +struct BidiGenerateContentSetup: Encodable { + /// The fully qualified name of the publisher model. + /// + /// Publisher model format: + /// `projects/{project}/locations/{location}/publishers/*/models/*` + let model: String + + /// Generation config. + let generationConfig: BidiGenerationConfig? + + /// The user provided system instructions for the model. + /// Note: only text should be used in parts and content in each part will be + /// in a separate paragraph. + let systemInstruction: ModelContent? + + /// A list of `Tools` the model may use to generate the next response. + /// + /// A `Tool` is a piece of code that enables the system to interact with + /// external systems to perform an action, or set of actions, outside of + /// knowledge and scope of the model. + let tools: [Tool]? + + let toolConfig: ToolConfig? + + /// Input transcription. 
The transcription is independent to the model turn + /// which means it doesn't imply any ordering between transcription and model + /// turn. + let inputAudioTranscription: BidiAudioTranscriptionConfig? + + /// Output transcription. The transcription is independent to the model turn + /// which means it doesn't imply any ordering between transcription and model + /// turn. + let outputAudioTranscription: BidiAudioTranscriptionConfig? + + init(model: String, + generationConfig: BidiGenerationConfig? = nil, + systemInstruction: ModelContent? = nil, + tools: [Tool]? = nil, + toolConfig: ToolConfig? = nil, + inputAudioTranscription: BidiAudioTranscriptionConfig? = nil, + outputAudioTranscription: BidiAudioTranscriptionConfig? = nil) { + self.model = model + self.generationConfig = generationConfig + self.systemInstruction = systemInstruction + self.tools = tools + self.toolConfig = toolConfig + self.inputAudioTranscription = inputAudioTranscription + self.outputAudioTranscription = outputAudioTranscription + } +} + +@available(iOS 15.0, macOS 12.0, macCatalyst 15.0, tvOS 15.0, *) +@available(watchOS, unavailable) +struct BidiAudioTranscriptionConfig: Encodable {} diff --git a/FirebaseAI/Sources/Types/Internal/Live/BidiGenerateContentSetupComplete.swift b/FirebaseAI/Sources/Types/Internal/Live/BidiGenerateContentSetupComplete.swift new file mode 100644 index 00000000000..54449782060 --- /dev/null +++ b/FirebaseAI/Sources/Types/Internal/Live/BidiGenerateContentSetupComplete.swift @@ -0,0 +1,20 @@ +// Copyright 2025 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +import Foundation + +/// Sent in response to a `BidiGenerateContentSetup` message from the client. +@available(iOS 15.0, macOS 12.0, macCatalyst 15.0, tvOS 15.0, *) +@available(watchOS, unavailable) +struct BidiGenerateContentSetupComplete: Decodable, Sendable {} diff --git a/FirebaseAI/Sources/Types/Internal/Live/BidiGenerateContentToolCall.swift b/FirebaseAI/Sources/Types/Internal/Live/BidiGenerateContentToolCall.swift new file mode 100644 index 00000000000..4c34e6367e9 --- /dev/null +++ b/FirebaseAI/Sources/Types/Internal/Live/BidiGenerateContentToolCall.swift @@ -0,0 +1,24 @@ +// Copyright 2025 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +import Foundation + +/// Request for the client to execute the `function_calls` and return the +/// responses with the matching `id`s. +@available(iOS 15.0, macOS 12.0, macCatalyst 15.0, tvOS 15.0, *) +@available(watchOS, unavailable) +struct BidiGenerateContentToolCall: Decodable, Sendable { + /// The function call to be executed. + let functionCalls: [FunctionCall]? 
+} diff --git a/FirebaseAI/Sources/Types/Internal/Live/BidiGenerateContentToolCallCancellation.swift b/FirebaseAI/Sources/Types/Internal/Live/BidiGenerateContentToolCallCancellation.swift new file mode 100644 index 00000000000..48bc991c1fa --- /dev/null +++ b/FirebaseAI/Sources/Types/Internal/Live/BidiGenerateContentToolCallCancellation.swift @@ -0,0 +1,27 @@ +// Copyright 2025 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +import Foundation + +/// Notification for the client that a previously issued `ToolCallMessage` +/// with the specified `id`s should have been not executed and should be +/// cancelled. If there were side-effects to those tool calls, clients may +/// attempt to undo the tool calls. This message occurs only in cases where the +/// clients interrupt server turns. +@available(iOS 15.0, macOS 12.0, macCatalyst 15.0, tvOS 15.0, *) +@available(watchOS, unavailable) +struct BidiGenerateContentToolCallCancellation: Decodable, Sendable { + /// The ids of the tool calls to be cancelled. + let ids: [String]? 
+} diff --git a/FirebaseAI/Sources/Types/Internal/Live/BidiGenerateContentToolResponse.swift b/FirebaseAI/Sources/Types/Internal/Live/BidiGenerateContentToolResponse.swift new file mode 100644 index 00000000000..c9d2506895b --- /dev/null +++ b/FirebaseAI/Sources/Types/Internal/Live/BidiGenerateContentToolResponse.swift @@ -0,0 +1,30 @@ +// Copyright 2025 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +import Foundation + +/// Client generated response to a `ToolCall` received from the server. +/// Individual `FunctionResponse` objects are matched to the respective +/// `FunctionCall` objects by the `id` field. +/// +/// Note that in the unary and server-streaming GenerateContent APIs function +/// calling happens by exchanging the `Content` parts, while in the bidi +/// GenerateContent APIs function calling happens over these dedicated set of +/// messages. +@available(iOS 15.0, macOS 12.0, macCatalyst 15.0, tvOS 15.0, *) +@available(watchOS, unavailable) +struct BidiGenerateContentToolResponse: Encodable { + /// The response to the function calls. + let functionResponses: [FunctionResponse]? 
+} diff --git a/FirebaseAI/Sources/Types/Internal/Live/BidiGenerateContentTranscription.swift b/FirebaseAI/Sources/Types/Internal/Live/BidiGenerateContentTranscription.swift new file mode 100644 index 00000000000..652799edf9d --- /dev/null +++ b/FirebaseAI/Sources/Types/Internal/Live/BidiGenerateContentTranscription.swift @@ -0,0 +1,19 @@ +// Copyright 2025 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +@available(iOS 15.0, macOS 12.0, macCatalyst 15.0, tvOS 15.0, *) +@available(watchOS, unavailable) +struct BidiGenerateContentTranscription: Decodable, Sendable { + let text: String? +} diff --git a/FirebaseAI/Sources/Types/Internal/Live/BidiGenerationConfig.swift b/FirebaseAI/Sources/Types/Internal/Live/BidiGenerationConfig.swift new file mode 100644 index 00000000000..a3a3e8a9f99 --- /dev/null +++ b/FirebaseAI/Sources/Types/Internal/Live/BidiGenerationConfig.swift @@ -0,0 +1,46 @@ +// Copyright 2025 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +import Foundation + +/// Configuration options for live content generation. +@available(iOS 15.0, macOS 12.0, macCatalyst 15.0, tvOS 15.0, *) +@available(watchOS, unavailable) +struct BidiGenerationConfig: Encodable, Sendable { + let temperature: Float? + let topP: Float? + let topK: Int? + let candidateCount: Int? + let maxOutputTokens: Int? + let presencePenalty: Float? + let frequencyPenalty: Float? + let responseModalities: [ResponseModality]? + let speechConfig: BidiSpeechConfig? + + init(temperature: Float? = nil, topP: Float? = nil, topK: Int? = nil, + candidateCount: Int? = nil, maxOutputTokens: Int? = nil, + presencePenalty: Float? = nil, frequencyPenalty: Float? = nil, + responseModalities: [ResponseModality]? = nil, + speechConfig: BidiSpeechConfig? = nil) { + self.temperature = temperature + self.topP = topP + self.topK = topK + self.candidateCount = candidateCount + self.maxOutputTokens = maxOutputTokens + self.presencePenalty = presencePenalty + self.frequencyPenalty = frequencyPenalty + self.responseModalities = responseModalities + self.speechConfig = speechConfig + } +} diff --git a/FirebaseAI/Sources/Types/Internal/Live/BidiSpeechConfig.swift b/FirebaseAI/Sources/Types/Internal/Live/BidiSpeechConfig.swift new file mode 100644 index 00000000000..80e7d341ef7 --- /dev/null +++ b/FirebaseAI/Sources/Types/Internal/Live/BidiSpeechConfig.swift @@ -0,0 +1,31 @@ +// Copyright 2025 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +import Foundation + +/// Speech generation config. +@available(iOS 15.0, macOS 12.0, macCatalyst 15.0, tvOS 15.0, *) +@available(watchOS, unavailable) +struct BidiSpeechConfig: Encodable, Sendable { + /// The configuration for the speaker to use. + let voiceConfig: VoiceConfig + + /// Language code (ISO 639. e.g. en-US) for the speech synthesization. + let languageCode: String? + + init(voiceConfig: VoiceConfig, languageCode: String?) { + self.voiceConfig = voiceConfig + self.languageCode = languageCode + } +} diff --git a/FirebaseAI/Sources/Types/Internal/Live/GoAway.swift b/FirebaseAI/Sources/Types/Internal/Live/GoAway.swift new file mode 100644 index 00000000000..f5c858b8b45 --- /dev/null +++ b/FirebaseAI/Sources/Types/Internal/Live/GoAway.swift @@ -0,0 +1,25 @@ +// Copyright 2025 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +import Foundation + +/// Server will not be able to service client soon. +@available(iOS 15.0, macOS 12.0, macCatalyst 15.0, tvOS 15.0, *) +@available(watchOS, unavailable) +struct GoAway: Decodable, Sendable { + /// The remaining time before the connection will be terminated as ABORTED. + /// The minimal time returned here is specified differently together with + /// the rate limits for a given model. + let timeLeft: ProtoDuration? 
+} diff --git a/FirebaseAI/Sources/Types/Internal/Live/LiveSessionService.swift b/FirebaseAI/Sources/Types/Internal/Live/LiveSessionService.swift new file mode 100644 index 00000000000..42f8364b90f --- /dev/null +++ b/FirebaseAI/Sources/Types/Internal/Live/LiveSessionService.swift @@ -0,0 +1,395 @@ +// Copyright 2025 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +import Foundation + +// TODO: remove @preconcurrency when we update to Swift 6 +// for context, see +// https://forums.swift.org/t/why-does-sending-a-sendable-value-risk-causing-data-races/73074 +@preconcurrency import FirebaseAppCheckInterop +@preconcurrency import FirebaseAuthInterop + +/// Facilitates communication with the backend for a ``LiveSession``. +/// +/// Using an actor will make it easier to adopt session resumption, as we have an isolated place for +/// maintaining mutability, which is backed by Swift concurrency implicitly; allowing us to avoid various +/// edge-case issues with dead-locks and data races. +/// +/// This mainly comes into play when we don't want to block developers from sending messages while a +/// session is being reloaded.
+@available(iOS 15.0, macOS 12.0, macCatalyst 15.0, tvOS 15.0, watchOS 8.0, *) +@available(watchOS, unavailable) +actor LiveSessionService { + let responses: AsyncThrowingStream + private let responseContinuation: AsyncThrowingStream + .Continuation + + // to ensure messages are sent in order, since swift actors are reentrant + private let messageQueue: AsyncStream + private let messageQueueContinuation: AsyncStream.Continuation + + let modelResourceName: String + let generationConfig: LiveGenerationConfig? + let urlSession: URLSession + let apiConfig: APIConfig + let firebaseInfo: FirebaseInfo + let requestOptions: RequestOptions + let tools: [Tool]? + let toolConfig: ToolConfig? + let systemInstruction: ModelContent? + + var webSocket: AsyncWebSocket? + + private let jsonEncoder = JSONEncoder() + private let jsonDecoder = JSONDecoder() + + /// Task that doesn't complete until the server sends a setupComplete message. + /// + /// Used to hold off on sending messages until the server is ready. + private var setupTask: Task + + /// Long running task that wraps around the websocket, propagating messages through the + /// public stream. + private var responsesTask: Task? + + /// Long running task that consumes user messages from the ``messageQueue`` and sends them through + /// the websocket. + private var messageQueueTask: Task?
+ + init(modelResourceName: String, + generationConfig: LiveGenerationConfig?, + urlSession: URLSession, + apiConfig: APIConfig, + firebaseInfo: FirebaseInfo, + tools: [Tool]?, + toolConfig: ToolConfig?, + systemInstruction: ModelContent?, + requestOptions: RequestOptions) { + (responses, responseContinuation) = AsyncThrowingStream.makeStream() + (messageQueue, messageQueueContinuation) = AsyncStream.makeStream() + self.modelResourceName = modelResourceName + self.generationConfig = generationConfig + self.urlSession = urlSession + self.apiConfig = apiConfig + self.firebaseInfo = firebaseInfo + self.tools = tools + self.toolConfig = toolConfig + self.systemInstruction = systemInstruction + self.requestOptions = requestOptions + setupTask = Task {} + } + + deinit { + setupTask.cancel() + responsesTask?.cancel() + messageQueueTask?.cancel() + webSocket?.disconnect() + + webSocket = nil + responsesTask = nil + messageQueueTask = nil + } + + /// Queue a message to be sent to the model. + /// + /// If there are any issues while sending the message, details about the issue will be logged. + /// + /// Since messages are queued synchronously, they are sent in-order. + func send(_ message: BidiGenerateContentClientMessage) { + messageQueueContinuation.yield(message) + } + + /// Start a new connection to the backend. + /// + /// Separated into its own function to make it easier to surface a way to call it separately when + /// resuming the same session.
+ func connect() async throws { + close() + // we launch the setup task in a separate task to allow us to cancel it via close + setupTask = Task { [weak self] in + // we need a continuation to surface that the setup is complete, while still allowing us to + // listen to the server + try await withCheckedThrowingContinuation { setupContinuation in + // nested task so we can use await + Task { [weak self] in + guard let self else { return } + await self.listenToServer(setupContinuation) + } + } + } + + try await setupTask.value + } + + /// Cancel any running tasks and close the websocket. + /// + /// This method is idempotent; if it's already run once, it will effectively be a no-op. + func close() { + setupTask.cancel() + responsesTask?.cancel() + messageQueueTask?.cancel() + webSocket?.disconnect() + + webSocket = nil + responsesTask = nil + messageQueueTask = nil + } + + /// Start a fresh websocket to the backend, and listen for responses. + /// + /// Will hold off on sending any messages until the server sends a setupComplete message. + /// + /// Will also close out the old websocket and the previous long running tasks. + private func listenToServer(_ setupComplete: CheckedContinuation) async { + do { + webSocket = try await createWebsocket() + } catch { + let error = LiveSessionSetupError(underlyingError: error) + close() + setupComplete.resume(throwing: error) + return + } + + guard let webSocket else { return } + let stream = webSocket.connect() + + var resumed = false + + // remove the uncommon (and unexpected) responses from the stream, to make normal path cleaner + let dataStream = stream.compactMap { (message: URLSessionWebSocketTask.Message) -> Data?
in + switch message { + case let .string(string): + AILog.error(code: .liveSessionUnexpectedResponse, "Unexpected string response: \(string)") + case let .data(data): + return data + @unknown default: + AILog.error(code: .liveSessionUnexpectedResponse, "Unknown message received: \(message)") + } + return nil + } + + do { + let setup = BidiGenerateContentSetup( + model: modelResourceName, + generationConfig: generationConfig?.bidiGenerationConfig, + systemInstruction: systemInstruction, + tools: tools, + toolConfig: toolConfig, + inputAudioTranscription: generationConfig?.inputAudioTranscription, + outputAudioTranscription: generationConfig?.outputAudioTranscription + ) + let data = try jsonEncoder.encode(BidiGenerateContentClientMessage.setup(setup)) + try await webSocket.send(.data(data)) + } catch { + let error = LiveSessionSetupError(underlyingError: error) + close() + setupComplete.resume(throwing: error) + return + } + + responsesTask = Task { + do { + for try await message in dataStream { + let response: BidiGenerateContentServerMessage + do { + response = try jsonDecoder.decode( + BidiGenerateContentServerMessage.self, + from: message + ) + } catch { + // only log the json if it wasn't a decoding error, but an unsupported message type + if error is InvalidMessageTypeError { + AILog.error( + code: .liveSessionUnsupportedMessage, + "The server sent a message that we don't currently have a mapping for." + ) + + AILog.debug( + code: .liveSessionUnsupportedMessagePayload, + message.encodeToJsonString() ?? 
"\(message)" + ) + } + + let error = LiveSessionUnsupportedMessageError(underlyingError: error) + // if we've already finished setting up, then only surface the error through responses + // otherwise, make the setup task error as well + if !resumed { + setupComplete.resume(throwing: error) + } + throw error + } + + if case .setupComplete = response.messageType { + if resumed { + AILog.debug( + code: .duplicateLiveSessionSetupComplete, + "Setup complete was received multiple times; this may be a bug in the model." + ) + } else { + // calling resume multiple times is an error in swift, so we catch multiple calls + // to avoid causing any issues due to model quirks + resumed = true + setupComplete.resume() + } + } else if let liveMessage = LiveServerMessage(from: response) { + if case let .goingAwayNotice(message) = liveMessage.payload { + // TODO: (b/444045023) When auto session resumption is enabled, call `connect` again + AILog.debug( + code: .liveSessionGoingAwaySoon, + "Session expires in: \(message.goAway.timeLeft?.timeInterval ?? 0)" + ) + } + + responseContinuation.yield(liveMessage) + } + } + } catch { + if let error = error as? WebSocketClosedError { + // only raise an error if the session didn't close normally (ie; the user calling close) + if error.closeCode != .goingAway { + let closureError: Error + if let error = error.underlyingError as? 
NSError, error.domain == NSURLErrorDomain, + error.code == NSURLErrorNetworkConnectionLost { + closureError = LiveSessionLostConnectionError(underlyingError: error) + } else { + closureError = LiveSessionUnexpectedClosureError(underlyingError: error) + } + close() + responseContinuation.finish(throwing: closureError) + } + } else { + // an error occurred outside the websocket, so it's likely not closed + close() + responseContinuation.finish(throwing: error) + } + } + } + + messageQueueTask = Task { + for await message in messageQueue { + // we don't propagate errors, since those are surfaced in the responses stream + guard let _ = try? await setupTask.value else { + break + } + + let data: Data + do { + data = try jsonEncoder.encode(message) + } catch { + AILog.error(code: .liveSessionFailedToEncodeClientMessage, error.localizedDescription) + AILog.debug( + code: .liveSessionFailedToEncodeClientMessagePayload, + String(describing: message) + ) + continue + } + + do { + try await webSocket.send(.data(data)) + } catch { + AILog.error(code: .liveSessionFailedToSendClientMessage, error.localizedDescription) + } + } + } + } + + /// Creates a websocket pointing to the backend. + /// + /// Will apply the required app check and auth headers, as the backend expects them.
+ private nonisolated func createWebsocket() async throws -> AsyncWebSocket { + let host = apiConfig.service.endpoint.rawValue.withoutPrefix("https://") + // TODO: (b/448722577) Set a location based on the api config + let urlString = switch apiConfig.service { + case .vertexAI: + "wss://\(host)/ws/google.firebase.vertexai.v1beta.LlmBidiService/BidiGenerateContent/locations/us-central1" + case .googleAI: + "wss://\(host)/ws/google.firebase.vertexai.v1beta.GenerativeService/BidiGenerateContent" + } + guard let url = URL(string: urlString) else { + throw NSError( + domain: "\(Constants.baseErrorDomain).\(Self.self)", + code: AILog.MessageCode.invalidWebsocketURL.rawValue, + userInfo: [ + NSLocalizedDescriptionKey: "The live API websocket URL is not a valid URL", + ] + ) + } + var urlRequest = URLRequest(url: url) + urlRequest.timeoutInterval = requestOptions.timeout + urlRequest.setValue(firebaseInfo.apiKey, forHTTPHeaderField: "x-goog-api-key") + urlRequest.setValue( + "\(GenerativeAIService.languageTag) \(GenerativeAIService.firebaseVersionTag)", + forHTTPHeaderField: "x-goog-api-client" + ) + urlRequest.setValue("application/json", forHTTPHeaderField: "Content-Type") + + if let appCheck = firebaseInfo.appCheck { + let tokenResult = try await appCheck.fetchAppCheckToken( + limitedUse: firebaseInfo.useLimitedUseAppCheckTokens, + domain: "LiveSessionService" + ) + urlRequest.setValue(tokenResult.token, forHTTPHeaderField: "X-Firebase-AppCheck") + if let error = tokenResult.error { + AILog.error( + code: .appCheckTokenFetchFailed, + "Failed to fetch AppCheck token. 
Error: \(error)" + ) + } + } + + if let auth = firebaseInfo.auth, let authToken = try await auth.getToken( + forcingRefresh: false + ) { + urlRequest.setValue("Firebase \(authToken)", forHTTPHeaderField: "Authorization") + } + + if firebaseInfo.app.isDataCollectionDefaultEnabled { + urlRequest.setValue(firebaseInfo.firebaseAppID, forHTTPHeaderField: "X-Firebase-AppId") + if let appVersion = Bundle.main.infoDictionary?["CFBundleShortVersionString"] as? String { + urlRequest.setValue(appVersion, forHTTPHeaderField: "X-Firebase-AppVersion") + } + } + + return AsyncWebSocket(urlSession: urlSession, urlRequest: urlRequest) + } +} + +private extension Data { + /// Encodes this into a raw json string, with no regard to specific keys. + /// + /// Will return `nil` if this data doesn't represent a valid json object. + func encodeToJsonString() -> String? { + do { + let object = try JSONSerialization.jsonObject(with: self) + let data = try JSONSerialization.data(withJSONObject: object) + + return String(data: data, encoding: .utf8) + } catch { + return nil + } + } +} + +private extension String { + /// Create a new string with the given prefix removed, if it's present. + /// + /// If the prefix isn't present, this string will be returned instead. + func withoutPrefix(_ prefix: String) -> String { + if let index = range(of: prefix, options: .anchored) { + return String(self[index.upperBound...]) + } else { + return self + } + } +} diff --git a/FirebaseAI/Sources/Types/Internal/Live/VoiceConfig.swift b/FirebaseAI/Sources/Types/Internal/Live/VoiceConfig.swift new file mode 100644 index 00000000000..0e6790c03f2 --- /dev/null +++ b/FirebaseAI/Sources/Types/Internal/Live/VoiceConfig.swift @@ -0,0 +1,74 @@ +// Copyright 2025 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +import Foundation + +/// Configuration for the speaker to use. +@available(iOS 15.0, macOS 12.0, macCatalyst 15.0, tvOS 15.0, *) +@available(watchOS, unavailable) +enum VoiceConfig { + /// Configuration for the prebuilt voice to use. + case prebuiltVoiceConfig(PrebuiltVoiceConfig) + + /// Configuration for the custom voice to use. + case customVoiceConfig(CustomVoiceConfig) +} + +/// The configuration for the prebuilt speaker to use. +/// +/// Not just a string on the parent proto, because there'll likely be a lot +/// more options here. +@available(iOS 15.0, macOS 12.0, macCatalyst 15.0, tvOS 15.0, *) +@available(watchOS, unavailable) +struct PrebuiltVoiceConfig: Encodable, Sendable { + /// The name of the preset voice to use. + let voiceName: String + + init(voiceName: String) { + self.voiceName = voiceName + } +} + +/// The configuration for the custom voice to use. +@available(iOS 15.0, macOS 12.0, macCatalyst 15.0, tvOS 15.0, *) +@available(watchOS, unavailable) +struct CustomVoiceConfig: Encodable, Sendable { + /// The sample of the custom voice, in pcm16 s16e format. 
+ let customVoiceSample: Data + + init(customVoiceSample: Data) { + self.customVoiceSample = customVoiceSample + } +} + +// MARK: - Encodable conformance + +@available(iOS 15.0, macOS 12.0, macCatalyst 15.0, tvOS 15.0, *) +@available(watchOS, unavailable) +extension VoiceConfig: Encodable { + enum CodingKeys: CodingKey { + case prebuiltVoiceConfig + case customVoiceConfig + } + + func encode(to encoder: any Encoder) throws { + var container = encoder.container(keyedBy: CodingKeys.self) + switch self { + case let .prebuiltVoiceConfig(setup): + try container.encode(setup, forKey: .prebuiltVoiceConfig) + case let .customVoiceConfig(clientContent): + try container.encode(clientContent, forKey: .customVoiceConfig) + } + } +} diff --git a/FirebaseAI/Sources/Types/Internal/ProtoDuration.swift b/FirebaseAI/Sources/Types/Internal/ProtoDuration.swift new file mode 100644 index 00000000000..1dac21d6429 --- /dev/null +++ b/FirebaseAI/Sources/Types/Internal/ProtoDuration.swift @@ -0,0 +1,112 @@ +// Copyright 2024 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +import Foundation + +/// Represents a signed, fixed-length span of time represented +/// as a count of seconds and fractions of seconds at nanosecond +/// resolution. +/// +/// This represents a +/// [`google.protobuf.duration`](https://protobuf.dev/reference/protobuf/google.protobuf/#duration). +struct ProtoDuration { + /// Signed seconds of the span of time. 
+ /// + /// Must be from -315,576,000,000 to +315,576,000,000 inclusive. + /// + /// Note: these bounds are computed from: + /// 60 sec/min * 60 min/hr * 24 hr/day * 365.25 days/year * 10000 years + let seconds: Int64 + + /// Signed fractions of a second at nanosecond resolution of the span of time. + /// + /// Durations less than one second are represented with a 0 + /// `seconds` field and a positive or negative `nanos` field. + /// + /// For durations of one second or more, a non-zero value for the `nanos` field must be + /// of the same sign as the `seconds` field. Must be from -999,999,999 + /// to +999,999,999 inclusive. + let nanos: Int32 + + /// Returns a `TimeInterval` representation of the `ProtoDuration`. + var timeInterval: TimeInterval { + return TimeInterval(seconds) + TimeInterval(nanos) / 1_000_000_000 + } +} + +// MARK: - Codable Conformance + +extension ProtoDuration: Decodable { + init(from decoder: any Decoder) throws { + var text = try decoder.singleValueContainer().decode(String.self) + if text.last != "s" { + AILog.warning( + code: .decodedMissingProtoDurationSuffix, + "Missing 's' at end of proto duration: \(text)." + ) + } else { + text.removeLast() + } + + let seconds: String + let nanoseconds: String + + let maybeSplit = text.split(separator: ".") + if maybeSplit.count > 2 { + AILog.warning( + code: .decodedInvalidProtoDurationString, + "Too many decimal places in proto duration (expected only 1): \(maybeSplit)." + ) + throw DecodingError.dataCorrupted(.init( + codingPath: [], + debugDescription: "Invalid proto duration string: \(text)" + )) + } + + if maybeSplit.count == 2 { + seconds = String(maybeSplit[0]) + nanoseconds = String(maybeSplit[1]) + } else { + seconds = text + nanoseconds = "0" + } + + guard let secs = Int64(seconds) else { + AILog.warning( + code: .decodedInvalidProtoDurationSeconds, + "Failed to parse the seconds to an Int64: \(seconds)." 
+ ) + + throw DecodingError.dataCorrupted(.init( + codingPath: [], + debugDescription: "Invalid proto duration seconds: \(text)" + )) + } + + guard let nanos = Int32(nanoseconds) else { + AILog.warning( + code: .decodedInvalidProtoDurationNanoseconds, + "Failed to parse the nanoseconds to an Int32: \(nanoseconds)." + ) + + throw DecodingError.dataCorrupted(.init( + codingPath: [], + debugDescription: "Invalid proto duration nanoseconds: \(text)" + )) + } + + self.seconds = secs + self.nanos = nanos + } +} diff --git a/FirebaseAI/Sources/Types/Public/Live/AudioTranscriptionConfig.swift b/FirebaseAI/Sources/Types/Public/Live/AudioTranscriptionConfig.swift new file mode 100644 index 00000000000..365afebc5da --- /dev/null +++ b/FirebaseAI/Sources/Types/Public/Live/AudioTranscriptionConfig.swift @@ -0,0 +1,33 @@ +// Copyright 2025 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/// Configuration options for audio transcriptions when communicating with a model that supports the +/// Gemini Live API. +/// +/// While there are not currently any options, this will likely change in the future. For now, just +/// providing an instance of this struct will enable audio transcriptions for the corresponding +/// input or output fields. 
+@available(iOS 15.0, macOS 12.0, macCatalyst 15.0, tvOS 15.0, *) +@available(watchOS, unavailable) +public struct AudioTranscriptionConfig: Sendable { + let audioTranscriptionConfig: BidiAudioTranscriptionConfig + + init(_ audioTranscriptionConfig: BidiAudioTranscriptionConfig) { + self.audioTranscriptionConfig = audioTranscriptionConfig + } + + public init() { + self.init(BidiAudioTranscriptionConfig()) + } +} diff --git a/FirebaseAI/Sources/Types/Public/Live/LiveAudioTranscription.swift b/FirebaseAI/Sources/Types/Public/Live/LiveAudioTranscription.swift new file mode 100644 index 00000000000..76dc112ee03 --- /dev/null +++ b/FirebaseAI/Sources/Types/Public/Live/LiveAudioTranscription.swift @@ -0,0 +1,26 @@ +// Copyright 2025 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/// Text transcription of some audio form during a live interaction with the model. +@available(iOS 15.0, macOS 12.0, macCatalyst 15.0, tvOS 15.0, *) +@available(watchOS, unavailable) +public struct LiveAudioTranscription: Sendable { + let transcript: BidiGenerateContentTranscription + /// Text representing the model's interpretation of what the audio said. + public var text: String? 
{ transcript.text }
+
+  init(_ transcript: BidiGenerateContentTranscription) {
+    self.transcript = transcript
+  }
+}
diff --git a/FirebaseAI/Sources/Types/Public/Live/LiveGenerationConfig.swift b/FirebaseAI/Sources/Types/Public/Live/LiveGenerationConfig.swift
new file mode 100644
index 00000000000..21692f27eed
--- /dev/null
+++ b/FirebaseAI/Sources/Types/Public/Live/LiveGenerationConfig.swift
@@ -0,0 +1,152 @@
+// Copyright 2025 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+import Foundation
+
+/// Configuration options for live content generation.
+@available(iOS 15.0, macOS 12.0, macCatalyst 15.0, tvOS 15.0, *)
+@available(watchOS, unavailable)
+public struct LiveGenerationConfig: Sendable {
+  let bidiGenerationConfig: BidiGenerationConfig
+  let inputAudioTranscription: BidiAudioTranscriptionConfig?
+  let outputAudioTranscription: BidiAudioTranscriptionConfig?
+
+  /// Creates a new ``LiveGenerationConfig`` value.
+  ///
+  /// See the
+  /// [Configure model parameters](https://firebase.google.com/docs/vertex-ai/model-parameters)
+  /// guide and the
+  /// [Cloud documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/model-reference/inference#generationconfig)
+  /// for more details.
+  ///
+  /// - Parameters:
+  ///   - temperature: Controls the randomness of the language model's output.
Higher values (for + /// example, 1.0) make the text more random and creative, while lower values (for example, + /// 0.1) make it more focused and deterministic. + /// + /// > Note: A temperature of 0 means that the highest probability tokens are always selected. + /// > In this case, responses for a given prompt are mostly deterministic, but a small amount + /// > of variation is still possible. + /// + /// > Important: The range of supported temperature values depends on the model; see the + /// > [documentation](https://firebase.google.com/docs/vertex-ai/model-parameters?platform=ios#temperature) + /// > for more details. + /// - topP: Controls diversity of generated text. Higher values (e.g., 0.9) produce more diverse + /// text, while lower values (e.g., 0.5) make the output more focused. + /// + /// The supported range is 0.0 to 1.0. + /// + /// > Important: The default `topP` value depends on the model; see the + /// > [documentation](https://firebase.google.com/docs/vertex-ai/model-parameters?platform=ios#top-p) + /// > for more details. + /// - topK: Limits the number of highest probability words the model considers when generating + /// text. For example, a topK of 40 means only the 40 most likely words are considered for the + /// next token. A higher value increases diversity, while a lower value makes the output more + /// deterministic. + /// + /// The supported range is 1 to 40. + /// + /// > Important: Support for `topK` and the default value depends on the model; see the + /// [documentation](https://firebase.google.com/docs/vertex-ai/model-parameters?platform=ios#top-k) + /// for more details. + /// - candidateCount: The number of response variations to return; defaults to 1 if not set. + /// Support for multiple candidates depends on the model; see the + /// [Cloud documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/model-reference/inference#generationconfig) + /// for more details. 
+ /// - maxOutputTokens: Maximum number of tokens that can be generated in the response. + /// See the configure model parameters [documentation](https://firebase.google.com/docs/vertex-ai/model-parameters?platform=ios#max-output-tokens) + /// for more details. + /// - presencePenalty: Controls the likelihood of repeating the same words or phrases already + /// generated in the text. Higher values increase the penalty of repetition, resulting in more + /// diverse output. + /// + /// > Note: While both `presencePenalty` and `frequencyPenalty` discourage repetition, + /// > `presencePenalty` applies the same penalty regardless of how many times the word/phrase + /// > has already appeared, whereas `frequencyPenalty` increases the penalty for *each* + /// > repetition of a word/phrase. + /// + /// > Important: The range of supported `presencePenalty` values depends on the model; see the + /// > [Cloud documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/model-reference/inference#generationconfig) + /// > for more details + /// - frequencyPenalty: Controls the likelihood of repeating words or phrases, with the penalty + /// increasing for each repetition. Higher values increase the penalty of repetition, + /// resulting in more diverse output. + /// + /// > Note: While both `frequencyPenalty` and `presencePenalty` discourage repetition, + /// > `frequencyPenalty` increases the penalty for *each* repetition of a word/phrase, whereas + /// > `presencePenalty` applies the same penalty regardless of how many times the word/phrase + /// > has already appeared. + /// + /// > Important: The range of supported `frequencyPenalty` values depends on the model; see + /// > the + /// > [Cloud documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/model-reference/inference#generationconfig) + /// > for more details + /// - responseModalities: The data types (modalities) that may be returned in model responses. 
+ /// + /// See the [multimodal + /// responses](https://cloud.google.com/vertex-ai/generative-ai/docs/multimodal-response-generation) + /// documentation for more details. + /// + /// > Warning: Specifying response modalities is a **Public Preview** feature, which means + /// > that it is not subject to any SLA or deprecation policy and could change in + /// > backwards-incompatible ways. + /// - speech: Controls the voice of the model, when streaming `audio` via + /// ``ResponseModality``. + /// - inputAudioTranscription: Configures (and enables) input transcriptions when streaming to + /// the model. + /// + /// Input transcripts are the model's interpretation of audio data sent to it, and they are + /// populated in model responses via ``LiveServerContent``. When this field is set to `nil`, + /// input transcripts are not populated in model responses. + /// - outputAudioTranscription: Configures (and enables) output transcriptions when streaming to + /// the model. + /// + /// Output transcripts are text representations of the audio the model is sending to the + /// client, and they are populated in model responses via ``LiveServerContent``. When this + /// field is set to `nil`, output transcripts are not populated in model responses. + /// + /// > Important: Transcripts are independent to the model turn. This means transcripts may + /// > come earlier or later than when the model sends the corresponding audio responses. + public init(temperature: Float? = nil, topP: Float? = nil, topK: Int? = nil, + candidateCount: Int? = nil, maxOutputTokens: Int? = nil, + presencePenalty: Float? = nil, frequencyPenalty: Float? = nil, + responseModalities: [ResponseModality]? = nil, + speech: SpeechConfig? = nil, + inputAudioTranscription: AudioTranscriptionConfig? = nil, + outputAudioTranscription: AudioTranscriptionConfig? 
= nil) { + self.init( + BidiGenerationConfig( + temperature: temperature, + topP: topP, + topK: topK, + candidateCount: candidateCount, + maxOutputTokens: maxOutputTokens, + presencePenalty: presencePenalty, + frequencyPenalty: frequencyPenalty, + responseModalities: responseModalities, + speechConfig: speech?.speechConfig + ), + inputAudioTranscription: inputAudioTranscription?.audioTranscriptionConfig, + outputAudioTranscription: outputAudioTranscription?.audioTranscriptionConfig + ) + } + + init(_ bidiGenerationConfig: BidiGenerationConfig, + inputAudioTranscription: BidiAudioTranscriptionConfig? = nil, + outputAudioTranscription: BidiAudioTranscriptionConfig? = nil) { + self.bidiGenerationConfig = bidiGenerationConfig + self.inputAudioTranscription = inputAudioTranscription + self.outputAudioTranscription = outputAudioTranscription + } +} diff --git a/FirebaseAI/Sources/Types/Public/Live/LiveGenerativeModel.swift b/FirebaseAI/Sources/Types/Public/Live/LiveGenerativeModel.swift new file mode 100644 index 00000000000..a9168789ff3 --- /dev/null +++ b/FirebaseAI/Sources/Types/Public/Live/LiveGenerativeModel.swift @@ -0,0 +1,74 @@ +// Copyright 2025 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +import Foundation + +/// A multimodal model (like Gemini) capable of real-time content generation based on +/// various input types, supporting bidirectional streaming. +/// +/// You can create a new session via ``connect()``. 
+@available(iOS 15.0, macOS 12.0, macCatalyst 15.0, tvOS 15.0, *) +@available(watchOS, unavailable) +public final class LiveGenerativeModel { + let modelResourceName: String + let firebaseInfo: FirebaseInfo + let apiConfig: APIConfig + let generationConfig: LiveGenerationConfig? + let tools: [Tool]? + let toolConfig: ToolConfig? + let systemInstruction: ModelContent? + let urlSession: URLSession + let requestOptions: RequestOptions + + init(modelResourceName: String, + firebaseInfo: FirebaseInfo, + apiConfig: APIConfig, + generationConfig: LiveGenerationConfig? = nil, + tools: [Tool]? = nil, + toolConfig: ToolConfig? = nil, + systemInstruction: ModelContent? = nil, + urlSession: URLSession = GenAIURLSession.default, + requestOptions: RequestOptions) { + self.modelResourceName = modelResourceName + self.firebaseInfo = firebaseInfo + self.apiConfig = apiConfig + self.generationConfig = generationConfig + self.tools = tools + self.toolConfig = toolConfig + self.systemInstruction = systemInstruction + self.urlSession = urlSession + self.requestOptions = requestOptions + } + + /// Start a ``LiveSession`` with the server for bidirectional streaming. + /// + /// - Returns: A new ``LiveSession`` that you can use to stream messages to and from the server. 
+ public func connect() async throws -> LiveSession { + let service = LiveSessionService( + modelResourceName: modelResourceName, + generationConfig: generationConfig, + urlSession: urlSession, + apiConfig: apiConfig, + firebaseInfo: firebaseInfo, + tools: tools, + toolConfig: toolConfig, + systemInstruction: systemInstruction, + requestOptions: requestOptions + ) + + try await service.connect() + + return LiveSession(service: service) + } +} diff --git a/FirebaseAI/Sources/Types/Public/Live/LiveServerContent.swift b/FirebaseAI/Sources/Types/Public/Live/LiveServerContent.swift new file mode 100644 index 00000000000..25e29e4b891 --- /dev/null +++ b/FirebaseAI/Sources/Types/Public/Live/LiveServerContent.swift @@ -0,0 +1,82 @@ +// Copyright 2025 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/// Incremental server update generated by the model in response to client +/// messages. +/// +/// Content is generated as quickly as possible, and not in realtime. Clients +/// may choose to buffer and play it out in realtime. +@available(iOS 15.0, macOS 12.0, macCatalyst 15.0, tvOS 15.0, *) +@available(watchOS, unavailable) +public struct LiveServerContent: Sendable { + let serverContent: BidiGenerateContentServerContent + + /// The content that the model has generated as part of the current + /// conversation with the user. + /// + /// This can be `nil` if the message signifies something else (such + /// as the turn ending). 
+ public var modelTurn: ModelContent? { serverContent.modelTurn } + + /// The model has finished sending data in the current turn. + /// + /// Generation will only start in response to additional client messages. + /// + /// Can be set alongside ``modelTurn``, indicating that the content is the last in the turn. + public var isTurnComplete: Bool { serverContent.turnComplete ?? false } + + /// The model was interrupted by a client message while generating data. + /// + /// If the client is playing out the content in realtime, this is a + /// good signal to stop and empty the current queue. + public var wasInterrupted: Bool { serverContent.interrupted ?? false } + + /// The model has finished _generating_ data for the current turn. + /// + /// For realtime playback, there will be a delay between when the model finishes generating + /// content and the client has finished playing back the generated content. `generationComplete` + /// indicates that the model is done generating data, while `isTurnComplete` indicates the model + /// is waiting for additional client messages. Sending a message during this delay may cause a + /// `wasInterrupted` message to be sent. + /// + /// Note that if the model `wasInterrupted`, this will not be set. The model will go from + /// `wasInterrupted` -> `turnComplete`. + public var isGenerationComplete: Bool { serverContent.generationComplete ?? false } + + /// Metadata specifying the sources used to ground generated content. + public var groundingMetadata: GroundingMetadata? { serverContent.groundingMetadata } + + /// The model's interpretation of what the client said in an audio message. + /// + /// This field is only populated when an ``AudioTranscriptionConfig`` is provided to + /// ``LiveGenerationConfig``. + public var inputAudioTranscription: LiveAudioTranscription? { + serverContent.inputTranscription.map { LiveAudioTranscription($0) } + } + + /// Transcription matching the model's audio response. 
+ /// + /// This field is only populated when an ``AudioTranscriptionConfig`` is provided to + /// ``LiveGenerationConfig``. + /// + /// > Important: Transcripts are independent to the model turn. This means transcripts may + /// > come earlier or later than when the model sends the corresponding audio responses. + public var outputAudioTranscription: LiveAudioTranscription? { + serverContent.outputTranscription.map { LiveAudioTranscription($0) } + } + + init(_ serverContent: BidiGenerateContentServerContent) { + self.serverContent = serverContent + } +} diff --git a/FirebaseAI/Sources/Types/Public/Live/LiveServerGoingAwayNotice.swift b/FirebaseAI/Sources/Types/Public/Live/LiveServerGoingAwayNotice.swift new file mode 100644 index 00000000000..981ddf0c251 --- /dev/null +++ b/FirebaseAI/Sources/Types/Public/Live/LiveServerGoingAwayNotice.swift @@ -0,0 +1,33 @@ +// Copyright 2025 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +import Foundation + +/// Server will not be able to service client soon. +/// +/// To learn more about session limits, see the docs on [Maximum session duration](https://cloud.google.com/vertex-ai/generative-ai/docs/model-reference/multimodal-live#maximum-session-duration)\. +@available(iOS 15.0, macOS 12.0, macCatalyst 15.0, tvOS 15.0, *) +@available(watchOS, unavailable) +public struct LiveServerGoingAwayNotice: Sendable { + let goAway: GoAway + /// The remaining time before the connection will be terminated as ABORTED. 
+ /// + /// The minimal time returned here is specified differently together with + /// the rate limits for a given model. + public var timeLeft: TimeInterval? { goAway.timeLeft?.timeInterval } + + init(_ goAway: GoAway) { + self.goAway = goAway + } +} diff --git a/FirebaseAI/Sources/Types/Public/Live/LiveServerMessage.swift b/FirebaseAI/Sources/Types/Public/Live/LiveServerMessage.swift new file mode 100644 index 00000000000..5868efca07f --- /dev/null +++ b/FirebaseAI/Sources/Types/Public/Live/LiveServerMessage.swift @@ -0,0 +1,77 @@ +// Copyright 2025 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/// Update from the server, generated from the model in response to client messages. +@available(iOS 15.0, macOS 12.0, macCatalyst 15.0, tvOS 15.0, *) +@available(watchOS, unavailable) +public struct LiveServerMessage: Sendable { + let serverMessage: BidiGenerateContentServerMessage + + /// The type of message sent from the server. + public enum Payload: Sendable { + /// Content generated by the model in response to client messages. + case content(LiveServerContent) + + /// Request for the client to execute the provided functions. + case toolCall(LiveServerToolCall) + + /// Notification for the client that a previously issued ``LiveServerToolCall`` should be + /// cancelled. + case toolCallCancellation(LiveServerToolCallCancellation) + + /// Server will disconnect soon. 
+ case goingAwayNotice(LiveServerGoingAwayNotice) + } + + /// The message sent from the server. + public let payload: Payload + + /// Metadata on the usage of the cached content. + public var usageMetadata: GenerateContentResponse.UsageMetadata? { serverMessage.usageMetadata } +} + +// MARK: - Internal parsing + +@available(iOS 15.0, macOS 12.0, macCatalyst 15.0, tvOS 15.0, *) +@available(watchOS, unavailable) +extension LiveServerMessage { + init?(from serverMessage: BidiGenerateContentServerMessage) { + guard let payload = LiveServerMessage.Payload(from: serverMessage.messageType) else { + return nil + } + + self.serverMessage = serverMessage + self.payload = payload + } +} + +@available(iOS 15.0, macOS 12.0, macCatalyst 15.0, tvOS 15.0, *) +@available(watchOS, unavailable) +extension LiveServerMessage.Payload { + init?(from serverMessage: BidiGenerateContentServerMessage.MessageType) { + switch serverMessage { + case .setupComplete: + // this is handled internally, and should not be surfaced to users + return nil + case let .serverContent(msg): + self = .content(LiveServerContent(msg)) + case let .toolCall(msg): + self = .toolCall(LiveServerToolCall(msg)) + case let .toolCallCancellation(msg): + self = .toolCallCancellation(LiveServerToolCallCancellation(msg)) + case let .goAway(msg): + self = .goingAwayNotice(LiveServerGoingAwayNotice(msg)) + } + } +} diff --git a/FirebaseAI/Sources/Types/Public/Live/LiveServerToolCall.swift b/FirebaseAI/Sources/Types/Public/Live/LiveServerToolCall.swift new file mode 100644 index 00000000000..7209e312c76 --- /dev/null +++ b/FirebaseAI/Sources/Types/Public/Live/LiveServerToolCall.swift @@ -0,0 +1,32 @@ +// Copyright 2025 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/// Request for the client to execute the provided ``functionCalls``.
+///
+/// The client should return matching ``FunctionResponsePart``, where the `functionId` fields
+/// correspond to individual ``FunctionCallPart``s.
+@available(iOS 15.0, macOS 12.0, macCatalyst 15.0, tvOS 15.0, *)
+@available(watchOS, unavailable)
+public struct LiveServerToolCall: Sendable {
+  let serverToolCall: BidiGenerateContentToolCall
+
+  /// A list of ``FunctionCallPart`` to run and return responses for.
+  public var functionCalls: [FunctionCallPart]? {
+    serverToolCall.functionCalls?.map { FunctionCallPart($0) }
+  }
+
+  init(_ serverToolCall: BidiGenerateContentToolCall) {
+    self.serverToolCall = serverToolCall
+  }
+}
diff --git a/FirebaseAI/Sources/Types/Public/Live/LiveServerToolCallCancellation.swift b/FirebaseAI/Sources/Types/Public/Live/LiveServerToolCallCancellation.swift
new file mode 100644
index 00000000000..ca7973c64b7
--- /dev/null
+++ b/FirebaseAI/Sources/Types/Public/Live/LiveServerToolCallCancellation.swift
@@ -0,0 +1,30 @@
+// Copyright 2025 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and +// limitations under the License. + +/// Notification for the client to cancel a previous function call from ``LiveServerToolCall``. +/// +/// The client does not need to send ``FunctionResponsePart``s for the cancelled +/// ``FunctionCallPart``s. +@available(iOS 15.0, macOS 12.0, macCatalyst 15.0, tvOS 15.0, *) +@available(watchOS, unavailable) +public struct LiveServerToolCallCancellation: Sendable { + let serverToolCallCancellation: BidiGenerateContentToolCallCancellation + /// A list of `functionId`s matching the `functionId` provided in a previous + /// ``LiveServerToolCall``, where only the provided `functionId`s should be cancelled. + public var ids: [String]? { serverToolCallCancellation.ids } + + init(_ serverToolCallCancellation: BidiGenerateContentToolCallCancellation) { + self.serverToolCallCancellation = serverToolCallCancellation + } +} diff --git a/FirebaseAI/Sources/Types/Public/Live/LiveSession.swift b/FirebaseAI/Sources/Types/Public/Live/LiveSession.swift new file mode 100644 index 00000000000..3e5e6923a59 --- /dev/null +++ b/FirebaseAI/Sources/Types/Public/Live/LiveSession.swift @@ -0,0 +1,140 @@ +// Copyright 2025 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +import Foundation + +/// A live WebSocket session, capable of streaming content to and from the model. 
+///
+/// Messages are streamed through ``responses``, and can be sent through either the dedicated
+/// realtime API function (such as ``sendAudioRealtime(_:)`` or ``sendTextRealtime(_:)``), or
+/// through the incremental API (such as ``sendContent(_:turnComplete:)``).
+///
+/// To create an instance of this class, see ``LiveGenerativeModel``.
+@available(iOS 15.0, macOS 12.0, macCatalyst 15.0, tvOS 15.0, *)
+@available(watchOS, unavailable)
+public final class LiveSession: Sendable {
+  private let service: LiveSessionService
+
+  /// An asynchronous stream of messages from the server.
+  ///
+  /// These messages are the incremental updates from the model for the current conversation.
+  public var responses: AsyncThrowingStream<LiveServerMessage, Error> { service.responses }
+
+  init(service: LiveSessionService) {
+    self.service = service
+  }
+
+  /// Response to a ``LiveServerToolCall`` received from the server.
+  ///
+  /// This method is used both for the realtime API and the incremental API.
+  ///
+  /// - Parameters:
+  ///   - responses: Client generated function results, matched to their respective
+  ///     ``FunctionCallPart`` by the `functionId` field.
+  public func sendFunctionResponses(_ responses: [FunctionResponsePart]) async {
+    let message = BidiGenerateContentToolResponse(
+      functionResponses: responses.map { $0.functionResponse }
+    )
+    await service.send(.toolResponse(message))
+  }
+
+  /// Sends an audio input stream to the model, using the realtime API.
+  ///
+  /// To learn more about audio formats, and the required state they should be provided in, see the
+  /// docs on
+  /// [Supported audio formats](https://cloud.google.com/vertex-ai/generative-ai/docs/live-api#supported-audio-formats).
+  ///
+  /// - Parameters:
+  ///   - audio: Raw 16-bit PCM audio at 16kHz, used to update the model on the client's
+  ///     conversation.
+ public func sendAudioRealtime(_ audio: Data) async { + // TODO: (b/443984790) address when we add RealtimeInputConfig support + let message = BidiGenerateContentRealtimeInput( + audio: InlineData(data: audio, mimeType: "audio/pcm") + ) + await service.send(.realtimeInput(message)) + } + + /// Sends a video input stream to the model, using the realtime API. + /// + /// - Parameters: + /// - video: Encoded video data, used to update the model on the client's conversation. + /// - format: The format that the video was encoded in (eg; `mp4`, `webm`, `wmv`, etc.,). + // TODO: (b/448671945) Make public after testing and next release + func sendVideoRealtime(_ video: Data, format: String) async { + let message = BidiGenerateContentRealtimeInput( + video: InlineData(data: video, mimeType: "video/\(format)") + ) + await service.send(.realtimeInput(message)) + } + + /// Sends a text input stream to the model, using the realtime API. + /// + /// - Parameters: + /// - text: Text content to append to the current client's conversation. + public func sendTextRealtime(_ text: String) async { + let message = BidiGenerateContentRealtimeInput(text: text) + await service.send(.realtimeInput(message)) + } + + /// Incremental update of the current conversation. + /// + /// The content is unconditionally appended to the conversation history and used as part of the + /// prompt to the model to generate content. + /// + /// Sending this message will also cause an interruption, if the server is actively generating + /// content. + /// + /// - Parameters: + /// - content: Content to append to the current conversation with the model. + /// - turnComplete: Whether the server should start generating content with the currently + /// accumulated prompt, or await additional messages before starting generation. By default, + /// the server will await additional messages. 
+  public func sendContent(_ content: [ModelContent], turnComplete: Bool = false) async {
+    let message = BidiGenerateContentClientContent(turns: content, turnComplete: turnComplete)
+    await service.send(.clientContent(message))
+  }
+
+  /// Incremental update of the current conversation.
+  ///
+  /// The content is unconditionally appended to the conversation history and used as part of the
+  /// prompt to the model to generate content.
+  ///
+  /// Sending this message will also cause an interruption, if the server is actively generating
+  /// content.
+  ///
+  /// - Parameters:
+  ///   - parts: Content to append to the current conversation with the model (see
+  ///     ``PartsRepresentable`` for conforming types).
+  ///   - turnComplete: Whether the server should start generating content with the currently
+  ///     accumulated prompt, or await additional messages before starting generation. By default,
+  ///     the server will await additional messages.
+  public func sendContent(_ parts: any PartsRepresentable...,
+                          turnComplete: Bool = false) async {
+    await sendContent([ModelContent(parts: parts)], turnComplete: turnComplete)
+  }
+
+  /// Permanently stop the conversation with the model, and close the connection to the server.
+  ///
+  /// This method will be called automatically when the ``LiveSession`` is deinitialized, but this
+  /// method can be called manually to explicitly end the session.
+  ///
+  /// Attempting to receive content from a closed session will cause a
+  /// ``LiveSessionUnexpectedClosureError`` error to be thrown.
+ public func close() async { + await service.close() + } + + // TODO: b(445716402) Add a start method when we support session resumption +} diff --git a/FirebaseAI/Sources/Types/Public/Live/LiveSessionErrors.swift b/FirebaseAI/Sources/Types/Public/Live/LiveSessionErrors.swift new file mode 100644 index 00000000000..90b7ab84476 --- /dev/null +++ b/FirebaseAI/Sources/Types/Public/Live/LiveSessionErrors.swift @@ -0,0 +1,102 @@ +// Copyright 2025 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +import Foundation + +/// The model sent a message that the SDK failed to parse. +/// +/// This may indicate that the SDK version needs updating, a model is too old for the current SDK +/// version, or that the model is just +/// not supported. +/// +/// Check the `NSUnderlyingErrorKey` entry in ``errorUserInfo`` for the error that caused this. +@available(iOS 15.0, macOS 12.0, macCatalyst 15.0, tvOS 15.0, *) +@available(watchOS, unavailable) +public struct LiveSessionUnsupportedMessageError: Error, Sendable, CustomNSError { + let underlyingError: Error + + init(underlyingError: Error) { + self.underlyingError = underlyingError + } + + public var errorUserInfo: [String: Any] { + [ + NSLocalizedDescriptionKey: "Failed to parse a live message from the model. Cause: \(underlyingError.localizedDescription)", + NSUnderlyingErrorKey: underlyingError, + ] + } +} + +/// The live session was closed, because the network connection was lost. 
+/// +/// Check the `NSUnderlyingErrorKey` entry in ``errorUserInfo`` for the error that caused this. +@available(iOS 15.0, macOS 12.0, macCatalyst 15.0, tvOS 15.0, *) +@available(watchOS, unavailable) +public struct LiveSessionLostConnectionError: Error, Sendable, CustomNSError { + let underlyingError: Error + + init(underlyingError: Error) { + self.underlyingError = underlyingError + } + + public var errorUserInfo: [String: Any] { + [ + NSLocalizedDescriptionKey: "The live session lost connection to the server. Cause: \(underlyingError.localizedDescription)", + NSUnderlyingErrorKey: underlyingError, + ] + } +} + +/// The live session was closed, but not for a reason the SDK expected. +/// +/// Check the `NSUnderlyingErrorKey` entry in ``errorUserInfo`` for the error that caused this. +@available(iOS 15.0, macOS 12.0, macCatalyst 15.0, tvOS 15.0, *) +@available(watchOS, unavailable) +public struct LiveSessionUnexpectedClosureError: Error, Sendable, CustomNSError { + let underlyingError: WebSocketClosedError + + init(underlyingError: WebSocketClosedError) { + self.underlyingError = underlyingError + } + + public var errorUserInfo: [String: Any] { + [ + NSLocalizedDescriptionKey: "The live session was closed for some unexpected reason. Cause: \(underlyingError.localizedDescription)", + NSUnderlyingErrorKey: underlyingError, + ] + } +} + +/// The model refused our request to setup a live session. +/// +/// This can occur due to the model not supporting the requested response modalities, the project +/// not having access to the model, the model being invalid, or some internal error. +/// +/// Check the `NSUnderlyingErrorKey` entry in ``errorUserInfo`` for the error that caused this. 
+@available(iOS 15.0, macOS 12.0, macCatalyst 15.0, tvOS 15.0, *)
+@available(watchOS, unavailable)
+public struct LiveSessionSetupError: Error, Sendable, CustomNSError {
+  let underlyingError: Error
+
+  init(underlyingError: Error) {
+    self.underlyingError = underlyingError
+  }
+
+  public var errorUserInfo: [String: Any] {
+    [
+      NSLocalizedDescriptionKey: "The model did not accept the live session request. Reason: \(underlyingError.localizedDescription)",
+      NSUnderlyingErrorKey: underlyingError,
+    ]
+  }
+}
diff --git a/FirebaseAI/Sources/Types/Public/Live/SpeechConfig.swift b/FirebaseAI/Sources/Types/Public/Live/SpeechConfig.swift
new file mode 100644
index 00000000000..67f4799f6e4
--- /dev/null
+++ b/FirebaseAI/Sources/Types/Public/Live/SpeechConfig.swift
@@ -0,0 +1,47 @@
+// Copyright 2025 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+import Foundation
+
+/// Configuration for controlling the voice of the model during conversation.
+@available(iOS 15.0, macOS 12.0, macCatalyst 15.0, tvOS 15.0, *)
+@available(watchOS, unavailable)
+public struct SpeechConfig: Sendable {
+  let speechConfig: BidiSpeechConfig
+
+  init(_ speechConfig: BidiSpeechConfig) {
+    self.speechConfig = speechConfig
+  }
+
+  /// Creates a new `SpeechConfig` value.
+  ///
+  /// - Parameters:
+  ///   - voiceName: The name of the prebuilt voice to be used for the model's speech response.
+ /// + /// To learn more about the available voices, see the docs on + /// [Voice options](https://ai.google.dev/gemini-api/docs/speech-generation#voices)\. + /// - languageCode: ISO-639 language code to use when parsing text sent from the client, instead + /// of audio. By default, the model will attempt to detect the input language automatically. + /// + /// To learn which codes are supported, see the docs on + /// [Supported languages](https://ai.google.dev/gemini-api/docs/speech-generation#languages)\. + public init(voiceName: String, languageCode: String? = nil) { + self.init( + BidiSpeechConfig( + voiceConfig: .prebuiltVoiceConfig(.init(voiceName: voiceName)), + languageCode: languageCode + ) + ) + } +} diff --git a/FirebaseAI/Sources/Types/Public/Part.swift b/FirebaseAI/Sources/Types/Public/Part.swift index e0015901d61..8acf7b12e9a 100644 --- a/FirebaseAI/Sources/Types/Public/Part.swift +++ b/FirebaseAI/Sources/Types/Public/Part.swift @@ -147,6 +147,11 @@ public struct FunctionCallPart: Part { public var isThought: Bool { _isThought ?? false } + /// Unique id of the function call. + /// + /// If present, the returned ``FunctionResponsePart`` should have a matching `functionId` field. + public var functionId: String? { functionCall.id } + /// Constructs a new function call part. /// /// > Note: A `FunctionCallPart` is typically received from the model, rather than created @@ -156,10 +161,24 @@ public struct FunctionCallPart: Part { /// - name: The name of the function to call. /// - args: The function parameters and values. public init(name: String, args: JSONObject) { - self.init(FunctionCall(name: name, args: args), isThought: nil, thoughtSignature: nil) + self.init(FunctionCall(name: name, args: args, id: nil), isThought: nil, thoughtSignature: nil) + } + + /// Constructs a new function call part. + /// + /// > Note: A `FunctionCallPart` is typically received from the model, rather than created + /// manually. 
+  ///
+  /// - Parameters:
+  ///   - name: The name of the function to call.
+  ///   - args: The function parameters and values.
+  ///   - id: Unique id of the function call. If present, the returned ``FunctionResponsePart``
+  ///     should have a matching `functionId` field.
+  public init(name: String, args: JSONObject, id: String? = nil) {
+    self.init(FunctionCall(name: name, args: args, id: id), isThought: nil, thoughtSignature: nil)
   }
 
-  init(_ functionCall: FunctionCall, isThought: Bool?, thoughtSignature: String?) {
+  init(_ functionCall: FunctionCall, isThought: Bool? = nil, thoughtSignature: String? = nil) {
     self.functionCall = functionCall
     _isThought = isThought
     self.thoughtSignature = thoughtSignature
@@ -177,6 +196,9 @@ public struct FunctionResponsePart: Part {
   let _isThought: Bool?
   let thoughtSignature: String?
 
+  /// Matching `id` for a ``FunctionCallPart``, if one was provided.
+  public var functionId: String? { functionResponse.id }
+
   /// The name of the function that was called.
   public var name: String { functionResponse.name }
 
@@ -196,6 +218,20 @@
+  /// Constructs a new `FunctionResponsePart`.
+  ///
+  /// - Parameters:
+  ///   - name: The name of the function that was called.
+  ///   - response: The function's response.
+  ///   - functionId: Matching `functionId` for a ``FunctionCallPart``, if one was provided.
+  public init(name: String, response: JSONObject, functionId: String? = nil) {
+    self.init(
+      FunctionResponse(name: name, response: response, id: functionId),
+      isThought: nil,
+      thoughtSignature: nil
+    )
+  }
+
+  init(_ functionResponse: FunctionResponse, isThought: Bool?, thoughtSignature: String?)
{ self.functionResponse = functionResponse _isThought = isThought diff --git a/FirebaseAI/Sources/Types/Public/ResponseModality.swift b/FirebaseAI/Sources/Types/Public/ResponseModality.swift index 442fed5f434..576046aa834 100644 --- a/FirebaseAI/Sources/Types/Public/ResponseModality.swift +++ b/FirebaseAI/Sources/Types/Public/ResponseModality.swift @@ -28,6 +28,7 @@ public struct ResponseModality: EncodableProtoEnum, Sendable { enum Kind: String { case text = "TEXT" case image = "IMAGE" + case audio = "AUDIO" } /// Specifies that the model should generate textual content. @@ -48,5 +49,18 @@ public struct ResponseModality: EncodableProtoEnum, Sendable { /// > backwards-incompatible ways. public static let image = ResponseModality(kind: .image) + /// **Public Preview**: Specifies that the model should generate audio content. + /// + /// Use this modality when you need the model to produce (spoken) audio responses based on the + /// provided input or prompts. + /// + /// > Warning: This is currently **only** supported via the + /// > [live api](https://firebase.google.com/docs/ai-logic/live-api)\. + /// > + /// > Furthermore, using the Firebase AI Logic SDKs with the Gemini Live API is in Public Preview, + /// > which means that the feature is not subject to any SLA or deprecation policy and could + /// > change in backwards-incompatible ways. + public static let audio = ResponseModality(kind: .audio) + let rawValue: String }