Speech recognition with .NET MAUI
Hello!
If you feel alone and are looking for someone to have a coffee chat with, maybe that someone is right in front of you. Yes, it is your device. 😉 Imagine that you ask your device a question and it replies to you. This article is devoted to the built-in speech recognition mechanisms of each platform.
.NET MAUI already has a mechanism to convert text to speech. There is a SpeakAsync method that receives the text you want to hear:
await TextToSpeech.Default.SpeakAsync("Hello world!");
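As a side note, SpeakAsync also has an overload that accepts SpeechOptions, so the voice can be tuned. A minimal sketch (the concrete values are just examples):
// A sketch: tuning the voice via SpeechOptions (values are examples only)
var locales = await TextToSpeech.Default.GetLocalesAsync();

await TextToSpeech.Default.SpeakAsync("Hello world!", new SpeechOptions
{
    Pitch = 1.5f,                     // 0.0 - 2.0
    Volume = 0.75f,                   // 0.0 - 1.0
    Locale = locales.FirstOrDefault() // pick a specific voice/locale
});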
Let's create a similar API, but for Speech-To-Text. Starting with the interface:
public interface ISpeechToText
{
Task<string> Listen(CultureInfo culture, IProgress<string>? recognitionResult, CancellationToken cancellationToken);
}
where culture is our spoken language, recognitionResult is the intermediate response from the Recognizer, and cancellationToken is used to stop the process. The method returns the final string output from the Recognizer.
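Each platform will get its own SpeechToTextImplementation, which we can register in the DI container. A minimal sketch of the registration, assuming the usual single-project setup (the App class and the compilation symbols are the standard template ones):
// MauiProgram.cs - a sketch of registering the platform implementations
public static class MauiProgram
{
    public static MauiApp CreateMauiApp()
    {
        var builder = MauiApp.CreateBuilder();
        builder.UseMauiApp<App>();

#if ANDROID || IOS || MACCATALYST || WINDOWS
        // Each platform folder provides its own SpeechToTextImplementation
        builder.Services.AddSingleton<ISpeechToText, SpeechToTextImplementation>();
#endif

        return builder.Build();
    }
}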
Android
The speech recognizer requires access to the microphone and the Internet, so add these lines to AndroidManifest.xml:
<uses-permission android:name="android.permission.INTERNET" />
<uses-permission android:name="android.permission.RECORD_AUDIO" />
Now let's implement our ISpeechToText interface:
public sealed class SpeechToTextImplementation : ISpeechToText
{
private SpeechRecognitionListener? listener;
private SpeechRecognizer? speechRecognizer;
public async Task<string> Listen(CultureInfo culture, IProgress<string>? recognitionResult, CancellationToken cancellationToken)
{
var taskResult = new TaskCompletionSource<string>();
listener = new SpeechRecognitionListener
{
Error = ex => taskResult.TrySetException(new Exception("Failure in speech engine - " + ex)),
PartialResults = sentence =>
{
recognitionResult?.Report(sentence);
},
Results = sentence => taskResult.TrySetResult(sentence)
};
speechRecognizer = SpeechRecognizer.CreateSpeechRecognizer(Android.App.Application.Context);
if (speechRecognizer is null)
{
throw new ArgumentException("Speech recognizer is not available");
}
speechRecognizer.SetRecognitionListener(listener);
speechRecognizer.StartListening(CreateSpeechIntent(culture));
await using (cancellationToken.Register(() =>
{
StopRecording();
taskResult.TrySetCanceled();
}))
{
return await taskResult.Task;
}
}
private void StopRecording()
{
speechRecognizer?.StopListening();
speechRecognizer?.Destroy();
}
private Intent CreateSpeechIntent(CultureInfo culture)
{
var intent = new Intent(RecognizerIntent.ActionRecognizeSpeech);
intent.PutExtra(RecognizerIntent.ExtraLanguagePreference, Java.Util.Locale.Default);
var javaLocale = Java.Util.Locale.ForLanguageTag(culture.Name);
intent.PutExtra(RecognizerIntent.ExtraLanguage, javaLocale);
intent.PutExtra(RecognizerIntent.ExtraLanguageModel, RecognizerIntent.LanguageModelFreeForm);
intent.PutExtra(RecognizerIntent.ExtraCallingPackage, Android.App.Application.Context.PackageName);
intent.PutExtra(RecognizerIntent.ExtraPartialResults, true);
return intent;
}
}
public class SpeechRecognitionListener : Java.Lang.Object, IRecognitionListener
{
public Action<SpeechRecognizerError>? Error { get; set; }
public Action<string>? PartialResults { get; set; }
public Action<string>? Results { get; set; }
public void OnBeginningOfSpeech()
{
}
public void OnBufferReceived(byte[]? buffer)
{
}
public void OnEndOfSpeech()
{
}
public void OnError([GeneratedEnum] SpeechRecognizerError error)
{
Error?.Invoke(error);
}
public void OnEvent(int eventType, Bundle? @params)
{
}
public void OnPartialResults(Bundle? partialResults)
{
SendResults(partialResults, PartialResults);
}
public void OnReadyForSpeech(Bundle? @params)
{
}
public void OnResults(Bundle? results)
{
SendResults(results, Results);
}
public void OnRmsChanged(float rmsdB)
{
}
void SendResults(Bundle? bundle, Action<string>? action)
{
var matches = bundle?.GetStringArrayList(SpeechRecognizer.ResultsRecognition);
if (matches == null || matches.Count == 0)
{
return;
}
action?.Invoke(matches.First());
}
}
The two key lines here are:
speechRecognizer.SetRecognitionListener(listener);
speechRecognizer.StartListening(CreateSpeechIntent(culture));
The first line sets the SpeechRecognitionListener, which exposes callbacks for the different states of speech recognition. The second line creates the speech intent that configures the recognizer and then starts listening.
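If you need more control over the session, the intent accepts additional extras. A hedged sketch of a few of them, to be added inside CreateSpeechIntent (the values are only examples):
// Optional extras for the recognition intent (example values)
intent.PutExtra(RecognizerIntent.ExtraMaxResults, 1);
intent.PutExtra(RecognizerIntent.ExtraSpeechInputCompleteSilenceLengthMillis, 2000);
intent.PutExtra(RecognizerIntent.ExtraSpeechInputPossiblyCompleteSilenceLengthMillis, 2000);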
iOS/MacCatalyst
The speech recognizer requires access to the microphone and permission to perform speech recognition, so add these keys to Info.plist:
<key>NSSpeechRecognitionUsageDescription</key>
<string>Recognize</string>
<key>NSMicrophoneUsageDescription</key>
<string>Microphone usage</string>
Now let's implement our ISpeechToText interface:
public sealed class SpeechToTextImplementation : ISpeechToText
{
private AVAudioEngine? audioEngine;
private SFSpeechAudioBufferRecognitionRequest? liveSpeechRequest;
private SFSpeechRecognizer? speechRecognizer;
private SFSpeechRecognitionTask? recognitionTask;
public async Task<string> Listen(CultureInfo culture, IProgress<string>? recognitionResult, CancellationToken cancellationToken)
{
speechRecognizer = new SFSpeechRecognizer(NSLocale.FromLocaleIdentifier(culture.Name));
if (!speechRecognizer.Available)
{
throw new ArgumentException("Speech recognizer is not available");
}
if (SFSpeechRecognizer.AuthorizationStatus != SFSpeechRecognizerAuthorizationStatus.Authorized)
{
throw new Exception("Permission denied");
}
audioEngine = new AVAudioEngine();
liveSpeechRequest = new SFSpeechAudioBufferRecognitionRequest();
#if MACCATALYST
var audioSession = AVAudioSession.SharedInstance();
audioSession.SetCategory(AVAudioSessionCategory.Record, AVAudioSessionCategoryOptions.DefaultToSpeaker);
var mode = audioSession.AvailableModes.Contains("AVAudioSessionModeMeasurement") ? "AVAudioSessionModeMeasurement" : audioSession.AvailableModes.First();
audioSession.SetMode(new NSString(mode), out var audioSessionError);
if (audioSessionError != null)
{
throw new Exception(audioSessionError.LocalizedDescription);
}
audioSession.SetActive(true, AVAudioSessionSetActiveOptions.NotifyOthersOnDeactivation, out audioSessionError);
if (audioSessionError is not null)
{
throw new Exception(audioSessionError.LocalizedDescription);
}
#endif
var node = audioEngine.InputNode;
var recordingFormat = node.GetBusOutputFormat(new UIntPtr(0));
node.InstallTapOnBus(new UIntPtr(0), 1024, recordingFormat, (buffer, _) =>
{
liveSpeechRequest.Append(buffer);
});
audioEngine.Prepare();
audioEngine.StartAndReturnError(out var error);
if (error is not null)
{
throw new ArgumentException("Error starting audio engine - " + error.LocalizedDescription);
}
var currentIndex = 0;
var taskResult = new TaskCompletionSource<string>();
recognitionTask = speechRecognizer.GetRecognitionTask(liveSpeechRequest, (result, err) =>
{
if (err != null)
{
StopRecording();
taskResult.TrySetException(new Exception(err.LocalizedDescription));
}
else
{
if (result.Final)
{
currentIndex = 0;
StopRecording();
taskResult.TrySetResult(result.BestTranscription.FormattedString);
}
else
{
for (var i = currentIndex; i < result.BestTranscription.Segments.Length; i++)
{
var s = result.BestTranscription.Segments[i].Substring;
currentIndex++;
recognitionResult?.Report(s);
}
}
}
});
await using (cancellationToken.Register(() =>
{
StopRecording();
taskResult.TrySetCanceled();
}))
{
return await taskResult.Task;
}
}
void StopRecording()
{
audioEngine?.InputNode.RemoveTapOnBus(new UIntPtr(0));
audioEngine?.Stop();
liveSpeechRequest?.EndAudio();
recognitionTask?.Cancel();
}
}
Similar to Android, here we also create a SpeechRecognizer and configure the AudioEngine. As an analogue of SpeechRecognitionListener, Apple provides the speechRecognizer.GetRecognitionTask method, where the second parameter is a callback that receives the recognition results.
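Note that Listen throws when SFSpeechRecognizer.AuthorizationStatus is not Authorized, so speech recognition and microphone access have to be requested beforehand. A minimal sketch of such a request (where you call it is up to you):
// A sketch: request speech recognition and microphone access before calling Listen
var authorization = new TaskCompletionSource<bool>();
SFSpeechRecognizer.RequestAuthorization(status =>
{
    if (status != SFSpeechRecognizerAuthorizationStatus.Authorized)
    {
        authorization.TrySetResult(false);
        return;
    }
    AVAudioSession.SharedInstance().RequestRecordPermission(granted => authorization.TrySetResult(granted));
});
var isAuthorized = await authorization.Task;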
Windows
The speech recognizer requires access to the microphone and the Internet (in case you choose online recognition), so add these lines to the Capabilities in Package.appxmanifest:
<Capability Name="internetClient" />
<DeviceCapability Name="microphone" />
The same as with Android and iOS, we implement the ISpeechToText interface:
public sealed class SpeechToTextImplementation : ISpeechToText
{
private SpeechRecognizer? speechRecognizer;
private string? recognitionText;
public async Task<string> Listen(CultureInfo culture, IProgress<string>? recognitionResult, CancellationToken cancellationToken)
{
recognitionText = string.Empty;
speechRecognizer = new SpeechRecognizer(new Language(culture.IetfLanguageTag));
await speechRecognizer.CompileConstraintsAsync();
var taskResult = new TaskCompletionSource<string>();
speechRecognizer.ContinuousRecognitionSession.ResultGenerated += (s, e) =>
{
recognitionText += e.Result.Text;
recognitionResult?.Report(e.Result.Text);
};
speechRecognizer.ContinuousRecognitionSession.Completed += (s, e) =>
{
switch (e.Status)
{
case SpeechRecognitionResultStatus.Success:
taskResult.TrySetResult(recognitionText);
break;
case SpeechRecognitionResultStatus.UserCanceled:
taskResult.TrySetCanceled();
break;
default:
taskResult.TrySetException(new Exception(e.Status.ToString()));
break;
}
};
await speechRecognizer.ContinuousRecognitionSession.StartAsync();
await using (cancellationToken.Register(async () =>
{
await StopRecording();
taskResult.TrySetCanceled();
}))
{
return await taskResult.Task;
}
}
private async Task StopRecording()
{
try
{
await speechRecognizer?.ContinuousRecognitionSession.StopAsync();
}
catch
{
// ignored. Recording may be already stopped
}
}
}
Sample
And now for the most pleasant step, checking that everything works:
try
{
RecognitionText = await speechToText.Listen(CultureInfo.GetCultureInfo("en-us"), new Progress<string>(partialText =>
{
RecognitionText += partialText + " ";
}), cancellationToken);
}
catch (Exception ex)
{
await Toast.Make(ex.Message).Show(cancellationToken);
}
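Since Listen only completes on a final result, an error, or cancellation, a Stop button can simply cancel the token. A sketch of the wiring (the member names are made up for the example):
// A sketch: wiring cancellation to a Stop button (names are illustrative only)
private CancellationTokenSource? cts;

private async Task StartListening()
{
    cts = new CancellationTokenSource();
    try
    {
        RecognitionText = await speechToText.Listen(
            CultureInfo.GetCultureInfo("en-US"),
            new Progress<string>(partialText => RecognitionText += partialText + " "),
            cts.Token);
    }
    catch (OperationCanceledException)
    {
        // the user pressed Stop
    }
}

private void StopListening() => cts?.Cancel();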
The final code can be found on GitHub.
Happy coding and never be alone!