This patch series is part of a larger upcoming branch. The goal of that branch is to get WinRT/UWP speech recognition working, primarily for the game Phasmophobia. All changes, which are still very much a work in progress, can be found in my GitHub repo: https://github.com/besentv/wine
Some notes on the current progress: libVosk is currently used for the speech-to-text part, which works pretty well. However, there are other issues, such as the in-game chat breaking, which I am still investigating.
Bernhard
Bernhard Kölbl (5): include/windows.media.speechrecognition.idl: Add SpeechRecognitionResult. include/windows.media.speechrecognition.idl: Add SpeechContinuousRecognitionSession. include/windows.media.speechrecognition.idl: Add SpeechRecognitionHypothesis. include/windows.media.speechrecognition.idl: Add SpeechRecognitionHypothesisGeneratedEventArgs. include/windows.media.speechrecognition.idl: Add SpeechRecognizerTimeouts.
include/windows.media.speechrecognition.idl | 253 ++++++++++++++++++++ 1 file changed, 253 insertions(+)
And its dependencies.
Signed-off-by: Bernhard Kölbl <besentv@gmail.com> --- include/windows.media.speechrecognition.idl | 92 +++++++++++++++++++++ 1 file changed, 92 insertions(+)
diff --git a/include/windows.media.speechrecognition.idl b/include/windows.media.speechrecognition.idl index 65c9d5ef8da..1dd73a2637d 100644 --- a/include/windows.media.speechrecognition.idl +++ b/include/windows.media.speechrecognition.idl @@ -32,9 +32,16 @@ import "windows.globalization.idl"; namespace Windows { namespace Media { namespace SpeechRecognition { + typedef enum SpeechRecognitionConfidence SpeechRecognitionConfidence; typedef enum SpeechRecognitionConstraintProbability SpeechRecognitionConstraintProbability; typedef enum SpeechRecognitionConstraintType SpeechRecognitionConstraintType; + typedef enum SpeechRecognitionResultStatus SpeechRecognitionResultStatus; interface ISpeechRecognitionConstraint; + interface ISpeechRecognitionResult; + interface ISpeechRecognitionResult2; + interface ISpeechRecognitionSemanticInterpretation; + runtimeclass SpeechRecognitionResult; + runtimeclass SpeechRecognitionSemanticInterpretation; } } } @@ -43,6 +50,9 @@ namespace Windows { namespace Media { namespace SpeechRecognition { declare { + interface Windows.Foundation.Collections.IIterable<Windows.Media.SpeechRecognition.SpeechRecognitionResult*>; + interface Windows.Foundation.Collections.IIterator<Windows.Media.SpeechRecognition.SpeechRecognitionResult*>; + interface Windows.Foundation.Collections.IVectorView<Windows.Media.SpeechRecognition.SpeechRecognitionResult*>; } } } @@ -52,6 +62,15 @@ namespace Windows { namespace Media { namespace SpeechRecognition {
+ [contract(Windows.Foundation.UniversalApiContract, 1.0)] + enum SpeechRecognitionConfidence + { + High = 0, + Medium = 1, + Low = 2, + Rejected = 3, + }; + [contract(Windows.Foundation.UniversalApiContract, 1.0)] enum SpeechRecognitionConstraintProbability { @@ -69,6 +88,22 @@ namespace Windows { VoiceCommandDefinition = 3, };
+ [contract(Windows.Foundation.UniversalApiContract, 1.0)] + enum SpeechRecognitionResultStatus + { + Success = 0, + TopicLanguageNotSupported = 1, + GrammarLanguageMismatch = 2, + GrammarCompilationFailure = 3, + AudioQualityFailure = 4, + UserCanceled = 5, + Unknown = 6, + TimeoutExceeded = 7, + PauseLimitExceeded = 8, + NetworkFailure = 9, + MicrophoneUnavailable = 10, + }; + [ contract(Windows.Foundation.UniversalApiContract, 1.0), uuid(79ac1628-4d68-43c4-8911-40dc4101b55b) @@ -83,6 +118,63 @@ namespace Windows { [propget] HRESULT Probability([out, retval] Windows.Media.SpeechRecognition.SpeechRecognitionConstraintProbability *value); [propput] HRESULT Probability([in] Windows.Media.SpeechRecognition.SpeechRecognitionConstraintProbability value); } + + [ + contract(Windows.Foundation.UniversalApiContract, 1.0), + exclusiveto(Windows.Media.SpeechRecognition.SpeechRecognitionResult), + uuid(4e303157-034e-4652-857e-d0454cc4beec) + ] + interface ISpeechRecognitionResult : IInspectable + { + [propget] HRESULT Status([out, retval] Windows.Media.SpeechRecognition.SpeechRecognitionResultStatus *value); + [propget] HRESULT Text([out, retval] HSTRING* value); + [propget] HRESULT Confidence([out, retval] Windows.Media.SpeechRecognition.SpeechRecognitionConfidence *value); + [propget] HRESULT SemanticInterpretation([out, retval] Windows.Media.SpeechRecognition.SpeechRecognitionSemanticInterpretation **value); + HRESULT GetAlternatives([in] UINT32 max_amount, [out, retval] Windows.Foundation.Collections.IVectorView<Windows.Media.SpeechRecognition.SpeechRecognitionResult*> **results); + [propget] HRESULT Constraint([out, retval] Windows.Media.SpeechRecognition.ISpeechRecognitionConstraint **value); + [propget] HRESULT RulePath([out, retval] Windows.Foundation.Collections.IVectorView<HSTRING> **value); + [propget] HRESULT RawConfidence([out, retval] DOUBLE *value); + } + + [ + contract(Windows.Foundation.UniversalApiContract, 1.0), + 
exclusiveto(Windows.Media.SpeechRecognition.SpeechRecognitionResult), + uuid(af7ed1ba-451b-4166-a0c1-1ffe84032d03) + ] + interface ISpeechRecognitionResult2 : IInspectable + { + [propget] HRESULT PhraseStartTime([out, retval] Windows.Foundation.DateTime *value); + [propget] HRESULT PhraseDuration([out, retval] Windows.Foundation.TimeSpan *value); + } + + [ + contract(Windows.Foundation.UniversalApiContract, 1.0), + exclusiveto(Windows.Media.SpeechRecognition.SpeechRecognitionSemanticInterpretation), + uuid(aae1da9b-7e32-4c1f-89fe-0c65f486f52e) + ] + interface ISpeechRecognitionSemanticInterpretation : IInspectable + { + [propget] HRESULT Properties([out, retval] Windows.Foundation.Collections.IMapView<HSTRING, Windows.Foundation.Collections.IVectorView<HSTRING>*> **value); + } + + [ + contract(Windows.Foundation.UniversalApiContract, 1.0), + marshaling_behavior(agile) + ] + runtimeclass SpeechRecognitionResult + { + [default] interface Windows.Media.SpeechRecognition.ISpeechRecognitionResult; + [contract(Windows.Foundation.UniversalApiContract, 1.0)] interface Windows.Media.SpeechRecognition.ISpeechRecognitionResult2; + } + + [ + contract(Windows.Foundation.UniversalApiContract, 1.0), + marshaling_behavior(agile) + ] + runtimeclass SpeechRecognitionSemanticInterpretation + { + [default] interface Windows.Media.SpeechRecognition.ISpeechRecognitionSemanticInterpretation; + } } } }
And its dependencies.
Signed-off-by: Bernhard Kölbl <besentv@gmail.com> --- include/windows.media.speechrecognition.idl | 91 +++++++++++++++++++++ 1 file changed, 91 insertions(+)
diff --git a/include/windows.media.speechrecognition.idl b/include/windows.media.speechrecognition.idl index 1dd73a2637d..48bbb26ec5a 100644 --- a/include/windows.media.speechrecognition.idl +++ b/include/windows.media.speechrecognition.idl @@ -32,14 +32,21 @@ import "windows.globalization.idl"; namespace Windows { namespace Media { namespace SpeechRecognition { + typedef enum SpeechContinuousRecognitionMode SpeechContinuousRecognitionMode; typedef enum SpeechRecognitionConfidence SpeechRecognitionConfidence; typedef enum SpeechRecognitionConstraintProbability SpeechRecognitionConstraintProbability; typedef enum SpeechRecognitionConstraintType SpeechRecognitionConstraintType; typedef enum SpeechRecognitionResultStatus SpeechRecognitionResultStatus; + interface ISpeechContinuousRecognitionCompletedEventArgs; + interface ISpeechContinuousRecognitionResultGeneratedEventArgs; + interface ISpeechContinuousRecognitionSession; interface ISpeechRecognitionConstraint; interface ISpeechRecognitionResult; interface ISpeechRecognitionResult2; interface ISpeechRecognitionSemanticInterpretation; + runtimeclass SpeechContinuousRecognitionCompletedEventArgs; + runtimeclass SpeechContinuousRecognitionResultGeneratedEventArgs; + runtimeclass SpeechContinuousRecognitionSession; runtimeclass SpeechRecognitionResult; runtimeclass SpeechRecognitionSemanticInterpretation; } @@ -53,6 +60,8 @@ namespace Windows { interface Windows.Foundation.Collections.IIterable<Windows.Media.SpeechRecognition.SpeechRecognitionResult*>; interface Windows.Foundation.Collections.IIterator<Windows.Media.SpeechRecognition.SpeechRecognitionResult*>; interface Windows.Foundation.Collections.IVectorView<Windows.Media.SpeechRecognition.SpeechRecognitionResult*>; + interface Windows.Foundation.TypedEventHandler<Windows.Media.SpeechRecognition.SpeechContinuousRecognitionSession*, Windows.Media.SpeechRecognition.SpeechContinuousRecognitionCompletedEventArgs*>; + interface 
Windows.Foundation.TypedEventHandler<Windows.Media.SpeechRecognition.SpeechContinuousRecognitionSession*, Windows.Media.SpeechRecognition.SpeechContinuousRecognitionResultGeneratedEventArgs*>; } } } @@ -62,6 +71,13 @@ namespace Windows { namespace Media { namespace SpeechRecognition {
+ [contract(Windows.Foundation.UniversalApiContract, 1.0)] + enum SpeechContinuousRecognitionMode + { + Default = 0, + PauseOnRecognition = 1 + }; + [contract(Windows.Foundation.UniversalApiContract, 1.0)] enum SpeechRecognitionConfidence { @@ -104,6 +120,51 @@ namespace Windows { MicrophoneUnavailable = 10, };
+ [ + contract(Windows.Foundation.UniversalApiContract, 1.0), + exclusiveto(Windows.Media.SpeechRecognition.SpeechContinuousRecognitionCompletedEventArgs), + uuid(e3d069bb-e30c-5e18-424b-7fbe81f8fbd0) + ] + interface ISpeechContinuousRecognitionCompletedEventArgs : IInspectable + { + [propget] HRESULT Status([out, retval] Windows.Media.SpeechRecognition.SpeechRecognitionResultStatus *value); + } + + [ + contract(Windows.Foundation.UniversalApiContract, 1.0), + exclusiveto(Windows.Media.SpeechRecognition.SpeechContinuousRecognitionResultGeneratedEventArgs), + uuid(19091e1e-6e7e-5a46-40fb-76594f786504) + ] + interface ISpeechContinuousRecognitionResultGeneratedEventArgs : IInspectable + { + [propget] HRESULT Result([out, retval] Windows.Media.SpeechRecognition.SpeechRecognitionResult **value); + } + + [ + contract(Windows.Foundation.UniversalApiContract, 1.0), + exclusiveto(Windows.Media.SpeechRecognition.SpeechContinuousRecognitionSession), + uuid(6a213c04-6614-49f8-99a2-b5e9b3a085c8) + ] + interface ISpeechContinuousRecognitionSession : IInspectable + { + [propget] HRESULT AutoStopSilenceTimeout([out, retval] Windows.Foundation.TimeSpan *value); + [propput] HRESULT AutoStopSilenceTimeout([in] Windows.Foundation.TimeSpan value); + [overload("StartAsync")] HRESULT StartAsync([out, retval] Windows.Foundation.IAsyncAction **action); + [overload("StartAsync")] HRESULT StartWithModeAsync([in] Windows.Media.SpeechRecognition.SpeechContinuousRecognitionMode mode, [out, retval] Windows.Foundation.IAsyncAction **action); + HRESULT StopAsync([out, retval] Windows.Foundation.IAsyncAction **action); + HRESULT CancelAsync([out, retval] Windows.Foundation.IAsyncAction **action); + HRESULT PauseAsync([out, retval] Windows.Foundation.IAsyncAction **action); + HRESULT Resume(); + [eventadd] HRESULT Completed( + [in] Windows.Foundation.TypedEventHandler<Windows.Media.SpeechRecognition.SpeechContinuousRecognitionSession*, 
Windows.Media.SpeechRecognition.SpeechContinuousRecognitionCompletedEventArgs*> *value, + [out, retval] EventRegistrationToken *token); + [eventremove] HRESULT Completed([in] EventRegistrationToken token); + [eventadd] HRESULT ResultGenerated( + [in] Windows.Foundation.TypedEventHandler<Windows.Media.SpeechRecognition.SpeechContinuousRecognitionSession*, Windows.Media.SpeechRecognition.SpeechContinuousRecognitionResultGeneratedEventArgs*> *value, + [out, retval] EventRegistrationToken *token); + [eventremove] HRESULT ResultGenerated([in] EventRegistrationToken token); + } + [ contract(Windows.Foundation.UniversalApiContract, 1.0), uuid(79ac1628-4d68-43c4-8911-40dc4101b55b) @@ -157,6 +218,36 @@ namespace Windows { [propget] HRESULT Properties([out, retval] Windows.Foundation.Collections.IMapView<HSTRING, Windows.Foundation.Collections.IVectorView<HSTRING>*> **value); }
+ [ + contract(Windows.Foundation.UniversalApiContract, 1.0), + marshaling_behavior(agile), + threading(both) + ] + runtimeclass SpeechContinuousRecognitionCompletedEventArgs + { + [default] interface Windows.Media.SpeechRecognition.ISpeechContinuousRecognitionCompletedEventArgs; + } + + [ + contract(Windows.Foundation.UniversalApiContract, 1.0), + marshaling_behavior(agile), + threading(both) + ] + runtimeclass SpeechContinuousRecognitionResultGeneratedEventArgs + { + [default] interface Windows.Media.SpeechRecognition.ISpeechContinuousRecognitionResultGeneratedEventArgs; + } + + [ + contract(Windows.Foundation.UniversalApiContract, 1.0), + marshaling_behavior(agile), + threading(both) + ] + runtimeclass SpeechContinuousRecognitionSession + { + [default] interface Windows.Media.SpeechRecognition.ISpeechContinuousRecognitionSession; + } + [ contract(Windows.Foundation.UniversalApiContract, 1.0), marshaling_behavior(agile)
Signed-off-by: Bernhard Kölbl <besentv@gmail.com> --- include/windows.media.speechrecognition.idl | 22 +++++++++++++++++++++ 1 file changed, 22 insertions(+)
diff --git a/include/windows.media.speechrecognition.idl b/include/windows.media.speechrecognition.idl index 48bbb26ec5a..9ecb3213663 100644 --- a/include/windows.media.speechrecognition.idl +++ b/include/windows.media.speechrecognition.idl @@ -41,12 +41,14 @@ namespace Windows { interface ISpeechContinuousRecognitionResultGeneratedEventArgs; interface ISpeechContinuousRecognitionSession; interface ISpeechRecognitionConstraint; + interface ISpeechRecognitionHypothesis; interface ISpeechRecognitionResult; interface ISpeechRecognitionResult2; interface ISpeechRecognitionSemanticInterpretation; runtimeclass SpeechContinuousRecognitionCompletedEventArgs; runtimeclass SpeechContinuousRecognitionResultGeneratedEventArgs; runtimeclass SpeechContinuousRecognitionSession; + runtimeclass SpeechRecognitionHypothesis; runtimeclass SpeechRecognitionResult; runtimeclass SpeechRecognitionSemanticInterpretation; } @@ -180,6 +182,16 @@ namespace Windows { [propput] HRESULT Probability([in] Windows.Media.SpeechRecognition.SpeechRecognitionConstraintProbability value); }
+ [ + contract(Windows.Foundation.UniversalApiContract, 1.0), + exclusiveto(Windows.Media.SpeechRecognition.SpeechRecognitionHypothesis), + uuid(7a7b25b0-99c5-4f7d-bf84-10aa1302b634) + ] + interface ISpeechRecognitionHypothesis : IInspectable + { + [propget] HRESULT Text([out, retval] HSTRING *value); + } + [ contract(Windows.Foundation.UniversalApiContract, 1.0), exclusiveto(Windows.Media.SpeechRecognition.SpeechRecognitionResult), @@ -248,6 +260,16 @@ namespace Windows { [default] interface Windows.Media.SpeechRecognition.ISpeechContinuousRecognitionSession; }
+ [ + contract(Windows.Foundation.UniversalApiContract, 1.0), + marshaling_behavior(agile), + threading(both) + ] + runtimeclass SpeechRecognitionHypothesis + { + [default] interface Windows.Media.SpeechRecognition.ISpeechRecognitionHypothesis; + } + [ contract(Windows.Foundation.UniversalApiContract, 1.0), marshaling_behavior(agile)
Signed-off-by: Bernhard Kölbl <besentv@gmail.com> --- include/windows.media.speechrecognition.idl | 22 +++++++++++++++++++++ 1 file changed, 22 insertions(+)
diff --git a/include/windows.media.speechrecognition.idl b/include/windows.media.speechrecognition.idl index 9ecb3213663..76352531116 100644 --- a/include/windows.media.speechrecognition.idl +++ b/include/windows.media.speechrecognition.idl @@ -42,6 +42,7 @@ namespace Windows { interface ISpeechContinuousRecognitionSession; interface ISpeechRecognitionConstraint; interface ISpeechRecognitionHypothesis; + interface ISpeechRecognitionHypothesisGeneratedEventArgs; interface ISpeechRecognitionResult; interface ISpeechRecognitionResult2; interface ISpeechRecognitionSemanticInterpretation; @@ -49,6 +50,7 @@ namespace Windows { runtimeclass SpeechContinuousRecognitionResultGeneratedEventArgs; runtimeclass SpeechContinuousRecognitionSession; runtimeclass SpeechRecognitionHypothesis; + runtimeclass SpeechRecognitionHypothesisGeneratedEventArgs; runtimeclass SpeechRecognitionResult; runtimeclass SpeechRecognitionSemanticInterpretation; } @@ -192,6 +194,16 @@ namespace Windows { [propget] HRESULT Text([out, retval] HSTRING *value); }
+ [ + contract(Windows.Foundation.UniversalApiContract, 1.0), + exclusiveto(Windows.Media.SpeechRecognition.SpeechRecognitionHypothesisGeneratedEventArgs), + uuid(55161a7a-8023-5866-411d-1213bb271476) + ] + interface ISpeechRecognitionHypothesisGeneratedEventArgs : IInspectable + { + [propget] HRESULT Hypothesis([out, retval] Windows.Media.SpeechRecognition.SpeechRecognitionHypothesis **value); + } + [ contract(Windows.Foundation.UniversalApiContract, 1.0), exclusiveto(Windows.Media.SpeechRecognition.SpeechRecognitionResult), @@ -270,6 +282,16 @@ namespace Windows { [default] interface Windows.Media.SpeechRecognition.ISpeechRecognitionHypothesis; }
+ [ + contract(Windows.Foundation.UniversalApiContract, 1.0), + marshaling_behavior(agile), + threading(both) + ] + runtimeclass SpeechRecognitionHypothesisGeneratedEventArgs + { + [default] interface Windows.Media.SpeechRecognition.ISpeechRecognitionHypothesisGeneratedEventArgs; + } + [ contract(Windows.Foundation.UniversalApiContract, 1.0), marshaling_behavior(agile)
Signed-off-by: Bernhard Kölbl <besentv@gmail.com> --- include/windows.media.speechrecognition.idl | 26 +++++++++++++++++++++ 1 file changed, 26 insertions(+)
diff --git a/include/windows.media.speechrecognition.idl b/include/windows.media.speechrecognition.idl index 76352531116..6ae1baf2040 100644 --- a/include/windows.media.speechrecognition.idl +++ b/include/windows.media.speechrecognition.idl @@ -46,6 +46,7 @@ namespace Windows { interface ISpeechRecognitionResult; interface ISpeechRecognitionResult2; interface ISpeechRecognitionSemanticInterpretation; + interface ISpeechRecognizerTimeouts; runtimeclass SpeechContinuousRecognitionCompletedEventArgs; runtimeclass SpeechContinuousRecognitionResultGeneratedEventArgs; runtimeclass SpeechContinuousRecognitionSession; @@ -53,6 +54,7 @@ namespace Windows { runtimeclass SpeechRecognitionHypothesisGeneratedEventArgs; runtimeclass SpeechRecognitionResult; runtimeclass SpeechRecognitionSemanticInterpretation; + runtimeclass SpeechRecognizerTimeouts; } } } @@ -242,6 +244,21 @@ namespace Windows { [propget] HRESULT Properties([out, retval] Windows.Foundation.Collections.IMapView<HSTRING, Windows.Foundation.Collections.IVectorView<HSTRING>*> **value); }
+ [ + contract(Windows.Foundation.UniversalApiContract, 1.0), + exclusiveto(Windows.Media.SpeechRecognition.SpeechRecognizerTimeouts), + uuid(2ef76fca-6a3c-4dca-a153-df1bc88a79af) + ] + interface ISpeechRecognizerTimeouts : IInspectable + { + [propget] HRESULT InitialSilenceTimeout([out, retval] Windows.Foundation.TimeSpan *value); + [propput] HRESULT InitialSilenceTimeout([in] Windows.Foundation.TimeSpan value); + [propget] HRESULT EndSilenceTimeout([out, retval] Windows.Foundation.TimeSpan *value); + [propput] HRESULT EndSilenceTimeout([in] Windows.Foundation.TimeSpan value); + [propget] HRESULT BabbleTimeout([out, retval] Windows.Foundation.TimeSpan *value); + [propput] HRESULT BabbleTimeout([in] Windows.Foundation.TimeSpan value); + } + [ contract(Windows.Foundation.UniversalApiContract, 1.0), marshaling_behavior(agile), @@ -310,6 +327,15 @@ namespace Windows { { [default] interface Windows.Media.SpeechRecognition.ISpeechRecognitionSemanticInterpretation; } + + [ + contract(Windows.Foundation.UniversalApiContract, 1.0), + marshaling_behavior(agile) + ] + runtimeclass SpeechRecognizerTimeouts + { + [default] interface Windows.Media.SpeechRecognition.ISpeechRecognizerTimeouts; + } } } }