From 2490cefa3b668c937145ac419ddd877b64ccc74c Mon Sep 17 00:00:00 2001
From: crc-32 <me@crc32.dev>
Date: Sun, 10 Nov 2024 15:55:12 +0000
Subject: [PATCH] clean up recognizer a little, fix deadlock

---
 .../speechrecognizer/RecognitionLanguage.kt   |   6 +
 .../speechrecognizer/RecognitionSupport.kt    |  20 +-
 .../SpeechRecognizerDictationService.kt       |  54 ++++--
 .../shared/handlers/VoiceSessionHandler.kt    | 171 +++++++++++-------
 4 files changed, 152 insertions(+), 99 deletions(-)
 create mode 100644 android/shared/src/androidMain/kotlin/io/rebble/cobble/shared/domain/voice/speechrecognizer/RecognitionLanguage.kt

diff --git a/android/shared/src/androidMain/kotlin/io/rebble/cobble/shared/domain/voice/speechrecognizer/RecognitionLanguage.kt b/android/shared/src/androidMain/kotlin/io/rebble/cobble/shared/domain/voice/speechrecognizer/RecognitionLanguage.kt
new file mode 100644
index 00000000..5272ef0f
--- /dev/null
+++ b/android/shared/src/androidMain/kotlin/io/rebble/cobble/shared/domain/voice/speechrecognizer/RecognitionLanguage.kt
@@ -0,0 +1,6 @@
+package io.rebble.cobble.shared.domain.voice.speechrecognizer
+
+data class RecognitionLanguage( // A recognizer language picked for the current locale, plus its install state
+        val tag: String, // language tag as listed by RecognitionSupport; later set as RecognizerIntent.EXTRA_LANGUAGE
+        val downloaded: Boolean, // true if taken from installedOnDeviceLanguages, false if only in supportedOnDeviceLanguages
+)
diff --git a/android/shared/src/androidMain/kotlin/io/rebble/cobble/shared/domain/voice/speechrecognizer/RecognitionSupport.kt b/android/shared/src/androidMain/kotlin/io/rebble/cobble/shared/domain/voice/speechrecognizer/RecognitionSupport.kt
index 83ecbe76..2eead7b8 100644
--- a/android/shared/src/androidMain/kotlin/io/rebble/cobble/shared/domain/voice/speechrecognizer/RecognitionSupport.kt
+++ b/android/shared/src/androidMain/kotlin/io/rebble/cobble/shared/domain/voice/speechrecognizer/RecognitionSupport.kt
@@ -7,21 +7,14 @@ import android.speech.RecognitionSupportCallback
 import android.speech.SpeechRecognizer
 import androidx.annotation.RequiresApi
 import androidx.compose.ui.text.intl.Locale
+import io.rebble.cobble.shared.Logging
 import kotlinx.coroutines.CompletableDeferred
 import kotlinx.coroutines.Dispatchers
 import kotlinx.coroutines.asExecutor
 
-enum class RecognitionSupportResult {
-    SupportedOnDevice,
-    SupportedOnline,
-    NeedsDownload,
-    Unsupported
-}
-
 @RequiresApi(VERSION_CODES.TIRAMISU)
-suspend fun SpeechRecognizer.checkRecognitionSupport(intent: Intent): RecognitionSupportResult {
+suspend fun SpeechRecognizer.checkRecognitionSupport(intent: Intent): RecognitionSupport {
     val result = CompletableDeferred<RecognitionSupport>()
-    val language = Locale.current.toLanguageTag()
     val executor = Dispatchers.IO.asExecutor()
     checkRecognitionSupport(intent, executor, object : RecognitionSupportCallback {
         override fun onSupportResult(recognitionSupport: RecognitionSupport) {
@@ -34,11 +27,6 @@ suspend fun SpeechRecognizer.checkRecognitionSupport(intent: Intent): Recognitio
         }
     })
     val support = result.await()
-    return when {
-        support.supportedOnDeviceLanguages.contains(language) -> RecognitionSupportResult.SupportedOnDevice
-        support.installedOnDeviceLanguages.contains(language) -> RecognitionSupportResult.SupportedOnDevice
-        support.onlineLanguages.contains(language) -> RecognitionSupportResult.SupportedOnline
-        support.pendingOnDeviceLanguages.contains(language) -> RecognitionSupportResult.NeedsDownload
-        else -> RecognitionSupportResult.Unsupported
-    }
+    Logging.d("Locale: ${Locale.current.toLanguageTag()}, Recognition support: $support")
+    return support
 }
\ No newline at end of file
diff --git a/android/shared/src/androidMain/kotlin/io/rebble/cobble/shared/domain/voice/speechrecognizer/SpeechRecognizerDictationService.kt b/android/shared/src/androidMain/kotlin/io/rebble/cobble/shared/domain/voice/speechrecognizer/SpeechRecognizerDictationService.kt
index 1f3eb2db..35d5f564 100644
--- a/android/shared/src/androidMain/kotlin/io/rebble/cobble/shared/domain/voice/speechrecognizer/SpeechRecognizerDictationService.kt
+++ b/android/shared/src/androidMain/kotlin/io/rebble/cobble/shared/domain/voice/speechrecognizer/SpeechRecognizerDictationService.kt
@@ -7,6 +7,7 @@ import android.media.AudioTrack
 import android.os.Build.VERSION_CODES
 import android.os.Bundle
 import android.os.ParcelFileDescriptor
+import android.os.ParcelFileDescriptor.AutoCloseOutputStream
 import android.speech.*
 import androidx.annotation.RequiresApi
 import androidx.compose.ui.text.intl.Locale
@@ -46,7 +47,7 @@ class SpeechRecognizerDictationService: DictationService, KoinComponent {
 
     companion object {
         private val AUDIO_LATENCY = 600.milliseconds
-        fun buildRecognizerIntent(audioSource: ParcelFileDescriptor? = null, encoding: Int = AudioFormat.ENCODING_PCM_16BIT, sampleRate: Int = 16000) = Intent(RecognizerIntent.ACTION_RECOGNIZE_SPEECH).apply {
+        fun buildRecognizerIntent(audioSource: ParcelFileDescriptor? = null, encoding: Int = AudioFormat.ENCODING_PCM_16BIT, sampleRate: Int = 16000, language: String? = null) = Intent(RecognizerIntent.ACTION_RECOGNIZE_SPEECH).apply {
             putExtra(RecognizerIntent.EXTRA_LANGUAGE_MODEL, RecognizerIntent.LANGUAGE_MODEL_FREE_FORM)
             audioSource?.let {
                 putExtra(RecognizerIntent.EXTRA_AUDIO_SOURCE, audioSource)
@@ -54,7 +55,9 @@ class SpeechRecognizerDictationService: DictationService, KoinComponent {
                 putExtra(RecognizerIntent.EXTRA_AUDIO_SOURCE_CHANNEL_COUNT, 1)
                 putExtra(RecognizerIntent.EXTRA_AUDIO_SOURCE_SAMPLING_RATE, sampleRate)
             }
-            putExtra(RecognizerIntent.EXTRA_LANGUAGE, Locale.current.toLanguageTag())
+            language?.let {
+                putExtra(RecognizerIntent.EXTRA_LANGUAGE, language)
+            }
         }
     }
 
@@ -115,8 +118,30 @@ class SpeechRecognizerDictationService: DictationService, KoinComponent {
         }
     }.flowOn(Dispatchers.Main)
 
+    /**
+     * Picks the on-device recognition language for the current locale, preferring installed languages
+     * over ones that are supported but not installed; returns null when neither list has a match.
+     */
+    private suspend fun SpeechRecognizer.getBestRecognitionLanguage(recognizerIntent: Intent): RecognitionLanguage? {
+        val support = withContext(Dispatchers.Main) { this@getBestRecognitionLanguage.checkRecognitionSupport(recognizerIntent) }
+        val locale = Locale.current.toLanguageTag()
+        // Match on whole subtags only: a bare startsWith(tag) would let "fi" claim the locale "fil-PH".
+        val matchesLocale = { tag: String -> locale == tag || locale.startsWith("$tag-") }
+        val installedBest = support.installedOnDeviceLanguages.firstOrNull(matchesLocale)
+        val availableBest = support.supportedOnDeviceLanguages.firstOrNull(matchesLocale)
+        return installedBest?.let { RecognitionLanguage(it, true) } ?: availableBest?.let { RecognitionLanguage(it, false) }
+    }
+
+    private fun createRecognizerPipes(): Pair<ParcelFileDescriptor, AutoCloseOutputStream> {
+        // createSocketPair() returns two connected descriptors: the first is returned as-is for the
+        // reading side, the second is wrapped in an AutoCloseOutputStream for the writing side.
+        val (readEnd, writeEnd) = ParcelFileDescriptor.createSocketPair()
+        return Pair(readEnd, AutoCloseOutputStream(writeEnd))
+    }
+
     override fun handleSpeechStream(speexEncoderInfo: SpeexEncoderInfo, audioStreamFrames: Flow<AudioStreamFrame>) = flow {
-        if (!SpeechRecognizer.isRecognitionAvailable(context)) {
+        if (!SpeechRecognizer.isOnDeviceRecognitionAvailable(context)) {
+            Logging.e("Offline speech recognition not available")
             emit(DictationServiceResponse.Error(Result.FailServiceUnavailable))
             return@flow
         }
@@ -124,25 +149,24 @@ class SpeechRecognizerDictationService: DictationService, KoinComponent {
         val decodeBufLength = Short.SIZE_BYTES * speexEncoderInfo.frameSize
         val decodedBuf = ByteBuffer.allocateDirect(decodeBufLength)
         decodedBuf.order(ByteOrder.nativeOrder())
-        val recognizerPipes = ParcelFileDescriptor.createSocketPair()
-        val recognizerReadPipe = recognizerPipes[0]
-        val recognizerWritePipe = ParcelFileDescriptor.AutoCloseOutputStream(recognizerPipes[1])
-        val recognizerIntent = buildRecognizerIntent(recognizerReadPipe, AudioFormat.ENCODING_PCM_16BIT, speexEncoderInfo.sampleRate.toInt())
-        //val recognizerIntent = buildRecognizerIntent()
+
+        val (recognizerReadPipe, recognizerWritePipe) = createRecognizerPipes()
         val speechRecognizer = withContext(Dispatchers.Main) {
             SpeechRecognizer.createOnDeviceSpeechRecognizer(context)
         }
-        val supported = withContext(Dispatchers.Main) {
-            speechRecognizer.checkRecognitionSupport(recognizerIntent)
+        val recognizerIntent = buildRecognizerIntent(recognizerReadPipe, AudioFormat.ENCODING_PCM_16BIT, speexEncoderInfo.sampleRate.toInt())
+        val recognitionLanguage = speechRecognizer.getBestRecognitionLanguage(recognizerIntent)
+        if (recognitionLanguage == null) {
+            Logging.e("No recognition language available")
+            emit(DictationServiceResponse.Error(Result.FailServiceUnavailable))
+            return@flow
         }
-
-        //TODO: handle downloads, etc
-        Logging.d("Recognition support: $supported")
-        if (supported == RecognitionSupportResult.Unsupported) {
-            Logging.e("Speech recognition language/type not supported")
+        if (!recognitionLanguage.downloaded) {
+            Logging.e("Recognition language not downloaded: ${recognitionLanguage.tag}")
             emit(DictationServiceResponse.Error(Result.FailServiceUnavailable))
             return@flow
         }
+        recognizerIntent.putExtra(RecognizerIntent.EXTRA_LANGUAGE, recognitionLanguage.tag)
         //audioTrack.play()
 
         val audioJob = scope.launch {
diff --git a/android/shared/src/commonMain/kotlin/io/rebble/cobble/shared/handlers/VoiceSessionHandler.kt b/android/shared/src/commonMain/kotlin/io/rebble/cobble/shared/handlers/VoiceSessionHandler.kt
index cb8b8cde..4696b750 100644
--- a/android/shared/src/commonMain/kotlin/io/rebble/cobble/shared/handlers/VoiceSessionHandler.kt
+++ b/android/shared/src/commonMain/kotlin/io/rebble/cobble/shared/handlers/VoiceSessionHandler.kt
@@ -8,10 +8,13 @@ import io.rebble.cobble.shared.domain.voice.SpeexEncoderInfo
 import io.rebble.cobble.shared.domain.voice.VoiceSession
 import io.rebble.libpebblecommon.packets.*
 import io.rebble.libpebblecommon.util.DataBuffer
+import kotlinx.coroutines.coroutineScope
 import kotlinx.coroutines.flow.*
 import kotlinx.coroutines.launch
+import kotlinx.coroutines.withTimeout
 import org.koin.core.component.KoinComponent
 import org.koin.core.component.inject
+import kotlin.time.Duration.Companion.minutes
 
 class VoiceSessionHandler(
         private val pebbleDevice: PebbleDevice,
@@ -44,6 +47,103 @@ class VoiceSessionHandler(
         )
     }
 
+    /**
+     * Runs one dictation session: feeds the session's audio frames to its recognizer and relays
+     * each recognizer response to the watch, stopping at the first Complete response.
+     *
+     * Until the Ready response has been relayed, failures are sent as a SessionSetupResult;
+     * afterwards they are sent as a DictationResult. Exceptions caught by the catch operator
+     * (thrown by the recognizer flow or by the response handling) are sent as FailRecognizerError.
+     *
+     * @param voiceSession the session whose recognizer, encoder info and audio frames are used
+     */
+    private suspend fun handleSpeechStream(voiceSession: VoiceSession) {
+        val appInitiated = voiceSession.appUuid != null
+        var sentReady = false
+
+        // Sends the failure packet matching the current stage of the session. Shared by the
+        // Error response and the catch handler so both report failures the same way.
+        // NOTE(review): the AppUuid is added bare here but wrapped in a VoiceAttribute on the
+        // Transcription path; kept as it was - confirm which form the watch expects.
+        suspend fun sendFailure(failure: Result) {
+            val packet = if (sentReady) {
+                DictationResult(
+                        voiceSession.sessionId.toUShort(),
+                        failure,
+                        buildList {
+                            if (appInitiated && voiceSession.appUuid != null) {
+                                add(VoiceAttribute.AppUuid().apply {
+                                    uuid.set(voiceSession.appUuid)
+                                })
+                            }
+                        }
+                )
+            } else {
+                SessionSetupResult(
+                        sessionType = SessionType.Dictation,
+                        result = failure
+                )
+            }
+            if (appInitiated) {
+                packet.flags.set(1u)
+            }
+            pebbleDevice.voiceService.send(packet)
+        }
+
+        voiceSession.recognizer.handleSpeechStream(voiceSession.encoderInfo, voiceSession.audioStreamFrames)
+                // Complete ends the stream here and is never passed on to the handlers below.
+                .takeWhile { it !is DictationServiceResponse.Complete }
+                .onEach { response ->
+                    Logging.v("DictationServiceResponse: $response")
+                    // Bound the handling of each response, including the send to the watch.
+                    withTimeout(1.minutes) {
+                        when (response) {
+                            is DictationServiceResponse.Ready -> {
+                                pebbleDevice.activeVoiceSession.value = voiceSession
+                                val result = SessionSetupResult(
+                                        sessionType = SessionType.Dictation,
+                                        result = Result.Success
+                                )
+                                if (appInitiated) {
+                                    result.flags.set(1u)
+                                }
+                                pebbleDevice.voiceService.send(result)
+                                // From here on, failures are reported as a DictationResult.
+                                sentReady = true
+                            }
+                            is DictationServiceResponse.Error -> sendFailure(response.result)
+                            is DictationServiceResponse.Transcription -> {
+                                val resp = DictationResult(
+                                        voiceSession.sessionId.toUShort(),
+                                        Result.Success,
+                                        buildList {
+                                            add(makeTranscription(response.sentences))
+                                            if (appInitiated && voiceSession.appUuid != null) {
+                                                add(VoiceAttribute(
+                                                        id = VoiceAttributeType.AppUuid.value,
+                                                        content = VoiceAttribute.AppUuid().apply {
+                                                            uuid.set(voiceSession.appUuid)
+                                                        }
+                                                ))
+                                            }
+                                        }
+                                )
+                                if (appInitiated) {
+                                    resp.flags.set(1u)
+                                }
+                                pebbleDevice.voiceService.send(resp)
+                            }
+                        }
+                    }
+                }
+                .catch {
+                    // Reached for exceptions from the recognizer flow and from the onEach handler above.
+                    Logging.e("Error in voice session: $it")
+                    sendFailure(Result.FailRecognizerError)
+                }
+                .collect()
+    }
+
     private suspend fun listenForVoiceSessions() {
         for (message in pebbleDevice.voiceService.receivedMessages) {
             when (message) {
@@ -70,74 +170,9 @@ class VoiceSessionHandler(
                         val dictationService: DictationService by inject()
                         val voiceSession = VoiceSession(appUuid, message.sessionId.get().toInt(), encoderInfo, dictationService)
                         Logging.d("Received voice session: $voiceSession")
-
-                        var sentReady = false
-                        dictationService.handleSpeechStream(voiceSession.encoderInfo, voiceSession.audioStreamFrames)
-                                .takeWhile { it !is DictationServiceResponse.Complete }
-                                .onEach {
-                                    Logging.v("DictationServiceResponse: $it")
-                                }
-                                .collect {
-                                    when (it) {
-                                        is DictationServiceResponse.Ready -> {
-                                            pebbleDevice.activeVoiceSession.value = voiceSession
-                                            val result = SessionSetupResult(
-                                                    sessionType = SessionType.Dictation,
-                                                    result = Result.Success
-                                            )
-                                            if (appInitiated) {
-                                                result.flags.set(1u)
-                                            }
-                                            pebbleDevice.voiceService.send(result)
-                                            sentReady = true
-                                        }
-                                        is DictationServiceResponse.Error -> {
-                                            val result = if (sentReady) {
-                                                DictationResult(
-                                                        voiceSession.sessionId.toUShort(),
-                                                        it.result,
-                                                        buildList {
-                                                            if (appInitiated && voiceSession.appUuid != null) {
-                                                                add(VoiceAttribute.AppUuid().apply {
-                                                                    uuid.set(voiceSession.appUuid)
-                                                                })
-                                                            }
-                                                        }
-                                                )
-                                            } else {
-                                                SessionSetupResult(
-                                                        sessionType = SessionType.Dictation,
-                                                        result = it.result
-                                                )
-                                            }
-                                            if (appInitiated) {
-                                                result.flags.set(1u)
-                                            }
-                                            pebbleDevice.voiceService.send(result)
-                                        }
-                                        is DictationServiceResponse.Transcription -> {
-                                            val resp = DictationResult(
-                                                    voiceSession.sessionId.toUShort(),
-                                                    Result.Success,
-                                                    buildList {
-                                                        add(makeTranscription(it.sentences))
-                                                        if (appInitiated && voiceSession.appUuid != null) {
-                                                            add(VoiceAttribute(
-                                                                    id = VoiceAttributeType.AppUuid.value,
-                                                                    content = VoiceAttribute.AppUuid().apply {
-                                                                        uuid.set(voiceSession.appUuid)
-                                                                    }
-                                                            ))
-                                                        }
-                                                    }
-                                            )
-                                            if (appInitiated) {
-                                                resp.flags.set(1u)
-                                            }
-                                            pebbleDevice.voiceService.send(resp)
-                                        }
-                                    }
-                                }
+                        coroutineScope {
+                            launch { handleSpeechStream(voiceSession) }
+                        }
                     }
                 }