From 2490cefa3b668c937145ac419ddd877b64ccc74c Mon Sep 17 00:00:00 2001 From: crc-32 Date: Sun, 10 Nov 2024 15:55:12 +0000 Subject: [PATCH] clean up recognizer a little, fix deadlock --- .../speechrecognizer/RecognitionLanguage.kt | 6 + .../speechrecognizer/RecognitionSupport.kt | 20 +- .../SpeechRecognizerDictationService.kt | 54 ++++-- .../shared/handlers/VoiceSessionHandler.kt | 171 +++++++++++------- 4 files changed, 152 insertions(+), 99 deletions(-) create mode 100644 android/shared/src/androidMain/kotlin/io/rebble/cobble/shared/domain/voice/speechrecognizer/RecognitionLanguage.kt diff --git a/android/shared/src/androidMain/kotlin/io/rebble/cobble/shared/domain/voice/speechrecognizer/RecognitionLanguage.kt b/android/shared/src/androidMain/kotlin/io/rebble/cobble/shared/domain/voice/speechrecognizer/RecognitionLanguage.kt new file mode 100644 index 00000000..5272ef0f --- /dev/null +++ b/android/shared/src/androidMain/kotlin/io/rebble/cobble/shared/domain/voice/speechrecognizer/RecognitionLanguage.kt @@ -0,0 +1,6 @@ +package io.rebble.cobble.shared.domain.voice.speechrecognizer + +data class RecognitionLanguage( + val tag: String, + val downloaded: Boolean, +) diff --git a/android/shared/src/androidMain/kotlin/io/rebble/cobble/shared/domain/voice/speechrecognizer/RecognitionSupport.kt b/android/shared/src/androidMain/kotlin/io/rebble/cobble/shared/domain/voice/speechrecognizer/RecognitionSupport.kt index 83ecbe76..2eead7b8 100644 --- a/android/shared/src/androidMain/kotlin/io/rebble/cobble/shared/domain/voice/speechrecognizer/RecognitionSupport.kt +++ b/android/shared/src/androidMain/kotlin/io/rebble/cobble/shared/domain/voice/speechrecognizer/RecognitionSupport.kt @@ -7,21 +7,14 @@ import android.speech.RecognitionSupportCallback import android.speech.SpeechRecognizer import androidx.annotation.RequiresApi import androidx.compose.ui.text.intl.Locale +import io.rebble.cobble.shared.Logging import kotlinx.coroutines.CompletableDeferred import kotlinx.coroutines.Dispatchers import kotlinx.coroutines.asExecutor -enum class RecognitionSupportResult { - SupportedOnDevice, - SupportedOnline, - NeedsDownload, - Unsupported -} - @RequiresApi(VERSION_CODES.TIRAMISU) -suspend fun SpeechRecognizer.checkRecognitionSupport(intent: Intent): RecognitionSupportResult { +suspend fun SpeechRecognizer.checkRecognitionSupport(intent: Intent): RecognitionSupport { val result = CompletableDeferred() - val language = Locale.current.toLanguageTag() val executor = Dispatchers.IO.asExecutor() checkRecognitionSupport(intent, executor, object : RecognitionSupportCallback { override fun onSupportResult(recognitionSupport: RecognitionSupport) { @@ -34,11 +27,6 @@ suspend fun SpeechRecognizer.checkRecognitionSupport(intent: Intent): Recognitio } }) val support = result.await() - return when { - support.supportedOnDeviceLanguages.contains(language) -> RecognitionSupportResult.SupportedOnDevice - support.installedOnDeviceLanguages.contains(language) -> RecognitionSupportResult.SupportedOnDevice - support.onlineLanguages.contains(language) -> RecognitionSupportResult.SupportedOnline - support.pendingOnDeviceLanguages.contains(language) -> RecognitionSupportResult.NeedsDownload - else -> RecognitionSupportResult.Unsupported - } + Logging.d("Locale: ${Locale.current.toLanguageTag()}, Recognition support: $support") + return support } \ No newline at end of file diff --git a/android/shared/src/androidMain/kotlin/io/rebble/cobble/shared/domain/voice/speechrecognizer/SpeechRecognizerDictationService.kt b/android/shared/src/androidMain/kotlin/io/rebble/cobble/shared/domain/voice/speechrecognizer/SpeechRecognizerDictationService.kt index 1f3eb2db..35d5f564 100644 --- a/android/shared/src/androidMain/kotlin/io/rebble/cobble/shared/domain/voice/speechrecognizer/SpeechRecognizerDictationService.kt +++ b/android/shared/src/androidMain/kotlin/io/rebble/cobble/shared/domain/voice/speechrecognizer/SpeechRecognizerDictationService.kt @@ -7,6 +7,7 @@ import android.media.AudioTrack import android.os.Build.VERSION_CODES import android.os.Bundle import android.os.ParcelFileDescriptor +import android.os.ParcelFileDescriptor.AutoCloseOutputStream import android.speech.* import androidx.annotation.RequiresApi import androidx.compose.ui.text.intl.Locale @@ -46,7 +47,7 @@ class SpeechRecognizerDictationService: DictationService, KoinComponent { companion object { private val AUDIO_LATENCY = 600.milliseconds - fun buildRecognizerIntent(audioSource: ParcelFileDescriptor? = null, encoding: Int = AudioFormat.ENCODING_PCM_16BIT, sampleRate: Int = 16000) = Intent(RecognizerIntent.ACTION_RECOGNIZE_SPEECH).apply { + fun buildRecognizerIntent(audioSource: ParcelFileDescriptor? = null, encoding: Int = AudioFormat.ENCODING_PCM_16BIT, sampleRate: Int = 16000, language: String? = null) = Intent(RecognizerIntent.ACTION_RECOGNIZE_SPEECH).apply { putExtra(RecognizerIntent.EXTRA_LANGUAGE_MODEL, RecognizerIntent.LANGUAGE_MODEL_FREE_FORM) audioSource?.let { putExtra(RecognizerIntent.EXTRA_AUDIO_SOURCE, audioSource) @@ -54,7 +55,9 @@ class SpeechRecognizerDictationService: DictationService, KoinComponent { putExtra(RecognizerIntent.EXTRA_AUDIO_SOURCE_CHANNEL_COUNT, 1) putExtra(RecognizerIntent.EXTRA_AUDIO_SOURCE_SAMPLING_RATE, sampleRate) } - putExtra(RecognizerIntent.EXTRA_LANGUAGE, Locale.current.toLanguageTag()) + language?.let { + putExtra(RecognizerIntent.EXTRA_LANGUAGE, language) + } } } @@ -115,8 +118,30 @@ class SpeechRecognizerDictationService: DictationService, KoinComponent { } }.flowOn(Dispatchers.Main) + private suspend fun SpeechRecognizer.getBestRecognitionLanguage(recognizerIntent: Intent): RecognitionLanguage? { + val support = withContext(Dispatchers.Main) { + this@getBestRecognitionLanguage.checkRecognitionSupport(recognizerIntent) + } + val locale = Locale.current.toLanguageTag() + val installedBest = support.installedOnDeviceLanguages.firstOrNull { locale.startsWith(it) } + val availableBest = support.supportedOnDeviceLanguages.firstOrNull { locale.startsWith(it) } + return when { + installedBest != null -> RecognitionLanguage(installedBest, true) + availableBest != null -> RecognitionLanguage(availableBest, false) + else -> null + } + } + + private fun createRecognizerPipes(): Pair { + val recognizerPipes = ParcelFileDescriptor.createSocketPair() + val recognizerReadPipe = recognizerPipes[0] + val recognizerWritePipe = AutoCloseOutputStream(recognizerPipes[1]) + return recognizerReadPipe to recognizerWritePipe + } + override fun handleSpeechStream(speexEncoderInfo: SpeexEncoderInfo, audioStreamFrames: Flow) = flow { - if (!SpeechRecognizer.isRecognitionAvailable(context)) { + if (!SpeechRecognizer.isOnDeviceRecognitionAvailable(context)) { + Logging.e("Offline speech recognition not available") emit(DictationServiceResponse.Error(Result.FailServiceUnavailable)) return@flow } @@ -124,25 +149,24 @@ class SpeechRecognizerDictationService: DictationService, KoinComponent { val decodeBufLength = Short.SIZE_BYTES * speexEncoderInfo.frameSize val decodedBuf = ByteBuffer.allocateDirect(decodeBufLength) decodedBuf.order(ByteOrder.nativeOrder()) - val recognizerPipes = ParcelFileDescriptor.createSocketPair() - val recognizerReadPipe = recognizerPipes[0] - val recognizerWritePipe = ParcelFileDescriptor.AutoCloseOutputStream(recognizerPipes[1]) - val recognizerIntent = buildRecognizerIntent(recognizerReadPipe, AudioFormat.ENCODING_PCM_16BIT, speexEncoderInfo.sampleRate.toInt()) - //val recognizerIntent = buildRecognizerIntent() + + val (recognizerReadPipe, recognizerWritePipe) = createRecognizerPipes() val speechRecognizer = withContext(Dispatchers.Main) { SpeechRecognizer.createOnDeviceSpeechRecognizer(context) } - val supported = withContext(Dispatchers.Main) { - speechRecognizer.checkRecognitionSupport(recognizerIntent) + val recognizerIntent = buildRecognizerIntent(recognizerReadPipe, AudioFormat.ENCODING_PCM_16BIT, speexEncoderInfo.sampleRate.toInt()) + val recognitionLanguage = speechRecognizer.getBestRecognitionLanguage(recognizerIntent) + if (recognitionLanguage == null) { + Logging.e("No recognition language available") + emit(DictationServiceResponse.Error(Result.FailServiceUnavailable)) + return@flow } - - //TODO: handle downloads, etc - Logging.d("Recognition support: $supported") - if (supported == RecognitionSupportResult.Unsupported) { - Logging.e("Speech recognition language/type not supported") + if (!recognitionLanguage.downloaded) { + Logging.e("Recognition language not downloaded: ${recognitionLanguage.tag}") emit(DictationServiceResponse.Error(Result.FailServiceUnavailable)) return@flow } + recognizerIntent.putExtra(RecognizerIntent.EXTRA_LANGUAGE, recognitionLanguage.tag) //audioTrack.play() val audioJob = scope.launch { diff --git a/android/shared/src/commonMain/kotlin/io/rebble/cobble/shared/handlers/VoiceSessionHandler.kt b/android/shared/src/commonMain/kotlin/io/rebble/cobble/shared/handlers/VoiceSessionHandler.kt index cb8b8cde..4696b750 100644 --- a/android/shared/src/commonMain/kotlin/io/rebble/cobble/shared/handlers/VoiceSessionHandler.kt +++ b/android/shared/src/commonMain/kotlin/io/rebble/cobble/shared/handlers/VoiceSessionHandler.kt @@ -8,10 +8,13 @@ import io.rebble.cobble.shared.domain.voice.SpeexEncoderInfo import io.rebble.cobble.shared.domain.voice.VoiceSession import io.rebble.libpebblecommon.packets.* import io.rebble.libpebblecommon.util.DataBuffer +import kotlinx.coroutines.coroutineScope import kotlinx.coroutines.flow.* import kotlinx.coroutines.launch +import kotlinx.coroutines.withTimeout import org.koin.core.component.KoinComponent import org.koin.core.component.inject +import kotlin.time.Duration.Companion.minutes class VoiceSessionHandler( private val pebbleDevice: PebbleDevice, @@ -44,6 +47,103 @@ class VoiceSessionHandler( ) } + private suspend fun handleSpeechStream(voiceSession: VoiceSession) { + val appInitiated = voiceSession.appUuid != null + var sentReady = false + voiceSession.recognizer.handleSpeechStream(voiceSession.encoderInfo, voiceSession.audioStreamFrames) + .takeWhile { it !is DictationServiceResponse.Complete } + .onEach { + Logging.v("DictationServiceResponse: $it") + withTimeout(1.minutes) { + when (it) { + is DictationServiceResponse.Ready -> { + pebbleDevice.activeVoiceSession.value = voiceSession + val result = SessionSetupResult( + sessionType = SessionType.Dictation, + result = Result.Success + ) + if (appInitiated) { + result.flags.set(1u) + } + pebbleDevice.voiceService.send(result) + sentReady = true + } + is DictationServiceResponse.Error -> { + val result = if (sentReady) { + DictationResult( + voiceSession.sessionId.toUShort(), + it.result, + buildList { + if (appInitiated && voiceSession.appUuid != null) { + add(VoiceAttribute.AppUuid().apply { + uuid.set(voiceSession.appUuid) + }) + } + } + ) + } else { + SessionSetupResult( + sessionType = SessionType.Dictation, + result = it.result + ) + } + if (appInitiated) { + result.flags.set(1u) + } + pebbleDevice.voiceService.send(result) + } + is DictationServiceResponse.Transcription -> { + val resp = DictationResult( + voiceSession.sessionId.toUShort(), + Result.Success, + buildList { + add(makeTranscription(it.sentences)) + if (appInitiated && voiceSession.appUuid != null) { + add(VoiceAttribute( + id = VoiceAttributeType.AppUuid.value, + content = VoiceAttribute.AppUuid().apply { + uuid.set(voiceSession.appUuid) + } + )) + } + } + ) + if (appInitiated) { + resp.flags.set(1u) + } + pebbleDevice.voiceService.send(resp) + } + } + } + } + .catch { + Logging.e("Error in voice session: $it") + val result = if (sentReady) { + DictationResult( + voiceSession.sessionId.toUShort(), + Result.FailRecognizerError, + buildList { + if (appInitiated && voiceSession.appUuid != null) { + add(VoiceAttribute.AppUuid().apply { + uuid.set(voiceSession.appUuid) + }) + } + } + ) + } else { + SessionSetupResult( + sessionType = SessionType.Dictation, + result = Result.FailRecognizerError + ) + } + if (appInitiated) { + result.flags.set(1u) + } + pebbleDevice.voiceService.send(result) + } + .collect() + } + private suspend fun listenForVoiceSessions() { for (message in pebbleDevice.voiceService.receivedMessages) { when (message) { @@ -70,74 +170,9 @@ class VoiceSessionHandler( val dictationService: DictationService by inject() val voiceSession = VoiceSession(appUuid, message.sessionId.get().toInt(), encoderInfo, dictationService) Logging.d("Received voice session: $voiceSession") - - var sentReady = false - dictationService.handleSpeechStream(voiceSession.encoderInfo, voiceSession.audioStreamFrames) - .takeWhile { it !is DictationServiceResponse.Complete } - .onEach { - Logging.v("DictationServiceResponse: $it") - } - .collect { - when (it) { - is DictationServiceResponse.Ready -> { - pebbleDevice.activeVoiceSession.value = voiceSession - val result = SessionSetupResult( - sessionType = SessionType.Dictation, - result = Result.Success - ) - if (appInitiated) { - result.flags.set(1u) - } - pebbleDevice.voiceService.send(result) - sentReady = true - } - is DictationServiceResponse.Error -> { - val result = if (sentReady) { - DictationResult( - voiceSession.sessionId.toUShort(), - it.result, - buildList { - if (appInitiated && voiceSession.appUuid != null) { - add(VoiceAttribute.AppUuid().apply { - uuid.set(voiceSession.appUuid) - }) - } - } - ) - } else { - SessionSetupResult( - sessionType = SessionType.Dictation, - result = it.result - ) - } - if (appInitiated) { - result.flags.set(1u) - } - pebbleDevice.voiceService.send(result) - } - is DictationServiceResponse.Transcription -> { - val resp = DictationResult( - voiceSession.sessionId.toUShort(), - Result.Success, - buildList { - add(makeTranscription(it.sentences)) - if (appInitiated && voiceSession.appUuid != null) { - add(VoiceAttribute( - id = VoiceAttributeType.AppUuid.value, - content = VoiceAttribute.AppUuid().apply { - uuid.set(voiceSession.appUuid) - } - )) - } - } - ) - if (appInitiated) { - resp.flags.set(1u) - } - pebbleDevice.voiceService.send(resp) - } - } - } + coroutineScope { + launch { handleSpeechStream(voiceSession) } + } } }