Commit e0e995a

VinayGuthal, rlazo, and daymxn authored

Live API Changes: Better Scheduling and Louder Output (#7481)

This PR does the following:

- Uses delay instead of yield for better scheduling.
- Uses CONTENT_TYPE_SPEECH instead of USAGE_VOICE_COMMUNICATION for a louder model voice.
- Launches audio recording, audio playback, and fetching the model response in 3 separate threads.

---------

Co-authored-by: Rodrigo Lazo <rlazo@users.noreply.github.com>
Co-authored-by: Daymon <daymxn@google.com>
Co-authored-by: Daymon <17409137+daymxn@users.noreply.github.com>
1 parent 089f0a1 commit e0e995a
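
For background on the first bullet: the polling loops in this change call `delay(0)` at each idle iteration instead of `yield()`, on the rationale (repeated in the code comments below) that `delay` goes through a different scheduler backend than `yield` and is "stickier" in its enforcement. A minimal sketch of the pattern, with a hypothetical `poll` standing in for the SDK's audio sources:

```kotlin
import kotlinx.coroutines.CoroutineScope
import kotlinx.coroutines.delay
import kotlinx.coroutines.isActive

// Illustrative only: `poll` stands in for an AudioRecord read or a queue poll.
suspend fun CoroutineScope.pollUntilCancelled(
  poll: () -> ByteArray?,
  process: (ByteArray) -> Unit,
) {
  while (isActive) {
    val data = poll()
    if (data == null) {
      delay(0) // was: yield(); gives other coroutines a turn while idle
      continue
    }
    process(data)
  }
}
```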

File tree

4 files changed: +96 -24 lines changed


firebase-ai/CHANGELOG.md

Lines changed: 2 additions & 0 deletions
```diff
@@ -1,5 +1,7 @@
 # Unreleased
 
+- [changed] Added better scheduling and louder output for Live API.
+- [changed] Added support for input and output transcription. (#7482)
 - [feature] Added support for sending realtime audio and video in a `LiveSession`.
 - [changed] Removed redundant internal exception types. (#7475)
 
```

firebase-ai/src/main/kotlin/com/google/firebase/ai/common/util/android.kt

Lines changed: 6 additions & 6 deletions
```diff
@@ -17,10 +17,8 @@
 package com.google.firebase.ai.common.util
 
 import android.media.AudioRecord
-import kotlin.time.Duration.Companion.milliseconds
 import kotlinx.coroutines.delay
 import kotlinx.coroutines.flow.flow
-import kotlinx.coroutines.yield
 
 /**
  * The minimum buffer size for this instance.
@@ -40,15 +38,17 @@ internal fun AudioRecord.readAsFlow() = flow {
 
   while (true) {
     if (recordingState != AudioRecord.RECORDSTATE_RECORDING) {
-      // TODO(vguthal): Investigate if both yield and delay are required.
-      delay(10.milliseconds)
-      yield()
+      // delay uses a different scheduler in the backend, so it's "stickier" in its enforcement when
+      // compared to yield.
+      delay(0)
       continue
     }
     val bytesRead = read(buffer, 0, buffer.size)
     if (bytesRead > 0) {
       emit(buffer.copyOf(bytesRead))
     }
-    yield()
+    // delay uses a different scheduler in the backend, so it's "stickier" in its enforcement when
+    // compared to yield.
+    delay(0)
   }
 }
```
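
A flow built this way is collected elsewhere in this PR via `buffer` and `launchIn`, so the polling loop stops when the owning scope is cancelled. A hedged sketch of the consuming side (`readAsFlow` itself is internal to the SDK, and the `send` callback is illustrative):

```kotlin
import android.media.AudioRecord
import kotlinx.coroutines.CoroutineScope
import kotlinx.coroutines.flow.buffer
import kotlinx.coroutines.flow.launchIn
import kotlinx.coroutines.flow.onEach

// Sketch only: assumes a configured, started AudioRecord and the readAsFlow()
// extension from the diff above.
fun collectMicAudio(
  scope: CoroutineScope,
  record: AudioRecord,
  send: suspend (ByteArray) -> Unit,
) {
  record
    .readAsFlow()
    .buffer() // decouple the polling loop from downstream processing
    .onEach { chunk -> send(chunk) } // e.g. forward PCM chunks over the websocket
    .launchIn(scope) // cancelling the scope ends the polling loop
}
```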

firebase-ai/src/main/kotlin/com/google/firebase/ai/type/AudioHelper.kt

Lines changed: 4 additions & 1 deletion
```diff
@@ -162,7 +162,10 @@ internal class AudioHelper(
   fun build(): AudioHelper {
     val playbackTrack =
       AudioTrack(
-        AudioAttributes.Builder().setUsage(AudioAttributes.USAGE_VOICE_COMMUNICATION).build(),
+        AudioAttributes.Builder()
+          .setUsage(AudioAttributes.USAGE_MEDIA)
+          .setContentType(AudioAttributes.CONTENT_TYPE_SPEECH)
+          .build(),
         AudioFormat.Builder()
           .setSampleRate(24000)
           .setChannelMask(AudioFormat.CHANNEL_OUT_MONO)
```
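
The louder-output change in one sentence: `USAGE_VOICE_COMMUNICATION` routes playback through the voice-call volume stream, while `USAGE_MEDIA` with `CONTENT_TYPE_SPEECH` plays the model's speech on the media stream, which is typically louder by default. A self-contained sketch of the full track construction (buffer sizing is illustrative; the 24 kHz mono PCM-16 format matches the constants used elsewhere in this PR):

```kotlin
import android.media.AudioAttributes
import android.media.AudioFormat
import android.media.AudioManager
import android.media.AudioTrack

fun buildPlaybackTrack(): AudioTrack {
  val attributes =
    AudioAttributes.Builder()
      .setUsage(AudioAttributes.USAGE_MEDIA) // media volume stream, not the call stream
      .setContentType(AudioAttributes.CONTENT_TYPE_SPEECH)
      .build()
  val format =
    AudioFormat.Builder()
      .setSampleRate(24000)
      .setChannelMask(AudioFormat.CHANNEL_OUT_MONO)
      .setEncoding(AudioFormat.ENCODING_PCM_16BIT)
      .build()
  val minBufferSize =
    AudioTrack.getMinBufferSize(24000, AudioFormat.CHANNEL_OUT_MONO, AudioFormat.ENCODING_PCM_16BIT)
  return AudioTrack(
    attributes,
    format,
    minBufferSize * 2, // headroom over the minimum; exact sizing is illustrative
    AudioTrack.MODE_STREAM,
    AudioManager.AUDIO_SESSION_ID_GENERATE,
  )
}
```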

firebase-ai/src/main/kotlin/com/google/firebase/ai/type/LiveSession.kt

Lines changed: 84 additions & 17 deletions
```diff
@@ -17,12 +17,17 @@
 package com.google.firebase.ai.type
 
 import android.Manifest.permission.RECORD_AUDIO
+import android.annotation.SuppressLint
 import android.content.pm.PackageManager
 import android.media.AudioFormat
 import android.media.AudioTrack
+import android.os.Process
+import android.os.StrictMode
+import android.os.StrictMode.ThreadPolicy
 import android.util.Log
 import androidx.annotation.RequiresPermission
 import androidx.core.content.ContextCompat
+import com.google.firebase.BuildConfig
 import com.google.firebase.FirebaseApp
 import com.google.firebase.ai.common.JSON
 import com.google.firebase.ai.common.util.CancelledCoroutineScope
@@ -34,21 +39,27 @@ import io.ktor.websocket.Frame
 import io.ktor.websocket.close
 import io.ktor.websocket.readBytes
 import java.util.concurrent.ConcurrentLinkedQueue
+import java.util.concurrent.Executors
+import java.util.concurrent.ThreadFactory
 import java.util.concurrent.atomic.AtomicBoolean
+import java.util.concurrent.atomic.AtomicLong
 import kotlin.coroutines.CoroutineContext
+import kotlinx.coroutines.CoroutineName
 import kotlinx.coroutines.CoroutineScope
+import kotlinx.coroutines.asCoroutineDispatcher
 import kotlinx.coroutines.cancel
 import kotlinx.coroutines.channels.Channel.Factory.UNLIMITED
+import kotlinx.coroutines.delay
 import kotlinx.coroutines.flow.Flow
 import kotlinx.coroutines.flow.buffer
 import kotlinx.coroutines.flow.catch
 import kotlinx.coroutines.flow.flow
+import kotlinx.coroutines.flow.flowOn
 import kotlinx.coroutines.flow.launchIn
 import kotlinx.coroutines.flow.onCompletion
 import kotlinx.coroutines.flow.onEach
 import kotlinx.coroutines.isActive
 import kotlinx.coroutines.launch
-import kotlinx.coroutines.yield
 import kotlinx.serialization.ExperimentalSerializationApi
 import kotlinx.serialization.Serializable
 import kotlinx.serialization.encodeToString
@@ -65,11 +76,21 @@ internal constructor(
   private val firebaseApp: FirebaseApp,
 ) {
   /**
-   * Coroutine scope that we batch data on for [startAudioConversation].
+   * Coroutine scope that we batch data on for network related behavior.
    *
    * Makes it easy to stop all the work with [stopAudioConversation] by just cancelling the scope.
    */
-  private var scope = CancelledCoroutineScope
+  private var networkScope = CancelledCoroutineScope
+
+  /**
+   * Coroutine scope that we batch data on for audio recording and playback.
+   *
+   * Separate from [networkScope] to ensure interchanging of dispatchers doesn't cause any deadlocks
+   * or issues.
+   *
+   * Makes it easy to stop all the work with [stopAudioConversation] by just cancelling the scope.
+   */
+  private var audioScope = CancelledCoroutineScope
 
   /**
    * Playback audio data sent from the model.
```
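
The scope split, reduced to its essentials: two sibling scopes with their own dispatchers and `CoroutineName`s (visible in debugger thread dumps), torn down together. A hedged sketch with stand-in dispatchers:

```kotlin
import kotlinx.coroutines.CoroutineName
import kotlinx.coroutines.CoroutineScope
import kotlinx.coroutines.Dispatchers
import kotlinx.coroutines.Job
import kotlinx.coroutines.cancel

// Sketch of the pattern: network and audio work never share a scope, so
// swapping dispatchers on one side cannot deadlock the other.
class ConversationScopes {
  val networkScope = CoroutineScope(Dispatchers.IO + Job() + CoroutineName("LiveSession Network"))
  val audioScope = CoroutineScope(Dispatchers.Default + Job() + CoroutineName("LiveSession Audio"))

  fun stop() {
    // Cancelling a scope cancels every coroutine launched in it.
    networkScope.cancel()
    audioScope.cancel()
  }
}
```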
```diff
@@ -159,16 +180,17 @@
     }
 
     FirebaseAIException.catchAsync {
-      if (scope.isActive) {
+      if (networkScope.isActive || audioScope.isActive) {
         Log.w(
           TAG,
           "startAudioConversation called after the recording has already started. " +
             "Call stopAudioConversation to close the previous connection."
         )
         return@catchAsync
       }
-
-      scope = CoroutineScope(blockingDispatcher + childJob())
+      networkScope =
+        CoroutineScope(blockingDispatcher + childJob() + CoroutineName("LiveSession Network"))
+      audioScope = CoroutineScope(audioDispatcher + childJob() + CoroutineName("LiveSession Audio"))
       audioHelper = AudioHelper.build()
 
       recordUserAudio()
@@ -188,7 +210,8 @@
     FirebaseAIException.catch {
       if (!startedReceiving.getAndSet(false)) return@catch
 
-      scope.cancel()
+      networkScope.cancel()
+      audioScope.cancel()
       playBackQueue.clear()
 
       audioHelper?.release()
@@ -231,7 +254,9 @@
             )
           }
           ?.let { emit(it.toPublic()) }
-        yield()
+        // delay uses a different scheduler in the backend, so it's "stickier" in its
+        // enforcement when compared to yield.
+        delay(0)
       }
     }
     .onCompletion { stopAudioConversation() }
@@ -258,7 +283,8 @@
     FirebaseAIException.catch {
       if (!startedReceiving.getAndSet(false)) return@catch
 
-      scope.cancel()
+      networkScope.cancel()
+      audioScope.cancel()
       playBackQueue.clear()
 
       audioHelper?.release()
@@ -403,18 +429,24 @@
     audioHelper
       ?.listenToRecording()
       ?.buffer(UNLIMITED)
+      ?.flowOn(audioDispatcher)
       ?.accumulateUntil(MIN_BUFFER_SIZE)
-      ?.onEach { sendAudioRealtime(InlineData(it, "audio/pcm")) }
+      ?.onEach {
+        sendAudioRealtime(InlineData(it, "audio/pcm"))
+        // delay uses a different scheduler in the backend, so it's "stickier" in its enforcement
+        // when compared to yield.
+        delay(0)
+      }
       ?.catch { throw FirebaseAIException.from(it) }
-      ?.launchIn(scope)
+      ?.launchIn(networkScope)
   }
 
   /**
    * Processes responses from the model during an audio conversation.
    *
    * Audio messages are added to [playBackQueue].
    *
-   * Launched asynchronously on [scope].
+   * Launched asynchronously on [networkScope].
    *
    * @param functionCallHandler A callback function that is invoked whenever the server receives a
    * function call.
```
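
One subtlety in the hunk above: `flowOn(audioDispatcher)` only moves the stages upstream of it (the `AudioRecord` polling) onto the audio threads; the `onEach { sendAudioRealtime(...) }` stage still runs in the collector's context, which is `networkScope` here. A small runnable demonstration of that split, using stand-in dispatchers:

```kotlin
import kotlinx.coroutines.Dispatchers
import kotlinx.coroutines.flow.collect
import kotlinx.coroutines.flow.flow
import kotlinx.coroutines.flow.flowOn
import kotlinx.coroutines.flow.onEach
import kotlinx.coroutines.runBlocking

fun main() = runBlocking {
  flow {
      // Upstream of flowOn: runs on Dispatchers.Default here, standing in for
      // the audio polling that runs on audioDispatcher in the PR.
      emit(Thread.currentThread().name)
    }
    .flowOn(Dispatchers.Default)
    .onEach { producer ->
      // Downstream of flowOn: runs in the collector's context, standing in for
      // the network-side send.
      println("produced on $producer, consumed on ${Thread.currentThread().name}")
    }
    .collect()
}
```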
```diff
@@ -471,18 +503,18 @@
         }
       }
     }
-      .launchIn(scope)
+      .launchIn(networkScope)
   }
 
   /**
    * Listens for playback data from the model and plays the audio.
    *
    * Polls [playBackQueue] for data, and calls [AudioHelper.playAudio] when data is received.
    *
-   * Launched asynchronously on [scope].
+   * Launched asynchronously on [networkScope].
    */
   private fun listenForModelPlayback(enableInterruptions: Boolean = false) {
-    scope.launch {
+    audioScope.launch {
       while (isActive) {
         val playbackData = playBackQueue.poll()
         if (playbackData == null) {
@@ -491,14 +523,16 @@
           if (!enableInterruptions) {
             audioHelper?.resumeRecording()
           }
-          yield()
+          // delay uses a different scheduler in the backend, so it's "stickier" in its enforcement
+          // when compared to yield.
+          delay(0)
         } else {
           /**
            * We pause the recording while the model is speaking to avoid interrupting it because of
            * no echo cancellation
            */
           // TODO(b/408223520): Conditionally pause when param is added
-          if (enableInterruptions != true) {
+          if (!enableInterruptions) {
             audioHelper?.pauseRecording()
           }
           audioHelper?.playAudio(playbackData)
```
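
The playback side, distilled: a lock-free queue bridges the network coroutine (producer) and the audio coroutine (consumer), with `delay(0)` on idle polls. A hedged sketch where `play` stands in for the internal `AudioHelper.playAudio`:

```kotlin
import java.util.concurrent.ConcurrentLinkedQueue
import kotlinx.coroutines.CoroutineScope
import kotlinx.coroutines.delay
import kotlinx.coroutines.isActive
import kotlinx.coroutines.launch

fun startPlaybackLoop(
  audioScope: CoroutineScope,
  queue: ConcurrentLinkedQueue<ByteArray>,
  play: (ByteArray) -> Unit,
) = audioScope.launch {
  while (isActive) {
    val data = queue.poll()
    if (data == null) {
      delay(0) // idle: let other coroutines run, per the commit's scheduling rationale
    } else {
      play(data)
    }
  }
}
```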
```diff
@@ -583,5 +617,38 @@
       AudioFormat.CHANNEL_OUT_MONO,
       AudioFormat.ENCODING_PCM_16BIT
     )
+    @SuppressLint("ThreadPoolCreation")
+    val audioDispatcher =
+      Executors.newCachedThreadPool(AudioThreadFactory()).asCoroutineDispatcher()
+  }
+}
+
+internal class AudioThreadFactory : ThreadFactory {
+  private val threadCount = AtomicLong()
+  private val policy: ThreadPolicy = audioPolicy()
+
+  override fun newThread(task: Runnable?): Thread? {
+    val thread =
+      DEFAULT.newThread {
+        Process.setThreadPriority(Process.THREAD_PRIORITY_AUDIO)
+        StrictMode.setThreadPolicy(policy)
+        task?.run()
+      }
+    thread.name = "Firebase Audio Thread #${threadCount.andIncrement}"
+    return thread
+  }
+
+  companion object {
+    val DEFAULT: ThreadFactory = Executors.defaultThreadFactory()
+
+    private fun audioPolicy(): ThreadPolicy {
+      val builder = ThreadPolicy.Builder().detectNetwork()
+
+      if (BuildConfig.DEBUG) {
+        builder.penaltyDeath()
+      }
+
+      return builder.penaltyLog().build()
+    }
+  }
 }
 }
```
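
Putting the threading pieces together: `AudioThreadFactory` produces `THREAD_PRIORITY_AUDIO` threads whose StrictMode policy logs any network access (and kills the process on it in debug builds), and the cached pool wrapped by `asCoroutineDispatcher()` is the `audioDispatcher` that `audioScope` runs on. A hedged usage sketch (only `AudioThreadFactory` comes from the diff above; the rest is illustrative):

```kotlin
import java.util.concurrent.Executors
import kotlinx.coroutines.CoroutineName
import kotlinx.coroutines.CoroutineScope
import kotlinx.coroutines.Job
import kotlinx.coroutines.asCoroutineDispatcher
import kotlinx.coroutines.launch

fun demoAudioScope() {
  val audioDispatcher =
    Executors.newCachedThreadPool(AudioThreadFactory()).asCoroutineDispatcher()
  val audioScope = CoroutineScope(audioDispatcher + Job() + CoroutineName("LiveSession Audio"))

  audioScope.launch {
    // Runs on a thread named "Firebase Audio Thread #n" at THREAD_PRIORITY_AUDIO.
    // Any network call in here trips the detectNetwork() StrictMode policy.
  }
}
```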
