@@ -16,12 +16,12 @@ syntax = "proto3";
|
16 | 16 |
|
17 | 17 | package google.cloud.dialogflow.v2beta1;
|
18 | 18 |
|
| 19 | +import "google/api/annotations.proto"; |
19 | 20 | import "google/api/field_behavior.proto";
|
20 | 21 | import "google/api/resource.proto";
|
21 | 22 | import "google/protobuf/duration.proto";
|
22 | 23 | import "google/protobuf/field_mask.proto";
|
23 | 24 | import "google/protobuf/timestamp.proto";
|
24 |
| -import "google/api/annotations.proto"; |
25 | 25 |
|
26 | 26 | option cc_enable_arenas = true;
|
27 | 27 | option csharp_namespace = "Google.Cloud.Dialogflow.V2beta1";
|
@@ -31,36 +31,6 @@ option java_outer_classname = "AudioConfigProto";
|
31 | 31 | option java_package = "com.google.cloud.dialogflow.v2beta1";
|
32 | 32 | option objc_class_prefix = "DF";
|
33 | 33 |
|
34 |
| -// Hints for the speech recognizer to help with recognition in a specific |
35 |
| -// conversation state. |
36 |
| -message SpeechContext { |
37 |
| -// Optional. A list of strings containing words and phrases that the speech |
38 |
| -// recognizer should recognize with higher likelihood. |
39 |
| -// |
40 |
| -// This list can be used to: |
41 |
| -// |
42 |
| -// * improve accuracy for words and phrases you expect the user to say, |
43 |
| -// e.g. typical commands for your Dialogflow agent |
44 |
| -// * add additional words to the speech recognizer vocabulary |
45 |
| -// * ... |
46 |
| -// |
47 |
| -// See the [Cloud Speech |
48 |
| -// documentation](https://cloud.google.com/speech-to-text/quotas) for usage |
49 |
| -// limits. |
50 |
| -repeated string phrases = 1; |
51 |
| - |
52 |
| -// Optional. Boost for this context compared to other contexts: |
53 |
| -// |
54 |
| -// * If the boost is positive, Dialogflow will increase the probability that |
55 |
| -// the phrases in this context are recognized over similar sounding phrases. |
56 |
| -// * If the boost is unspecified or non-positive, Dialogflow will not apply |
57 |
| -// any boost. |
58 |
| -// |
59 |
| -// Dialogflow recommends that you use boosts in the range (0, 20] and that you |
60 |
| -// find a value that fits your use case with binary search. |
61 |
| -float boost = 2; |
62 |
| -} |
63 |
| - |
64 | 34 | // Audio encoding of the audio content sent in the conversational query request.
|
65 | 35 | // Refer to the
|
66 | 36 | // [Cloud Speech API
|
@@ -110,29 +80,34 @@ enum AudioEncoding {
|
110 | 80 | AUDIO_ENCODING_SPEEX_WITH_HEADER_BYTE = 7;
|
111 | 81 | }
|
112 | 82 |
|
113 |
| -// Information for a word recognized by the speech recognizer. |
114 |
| -message SpeechWordInfo { |
115 |
| -// The word this info is for. |
116 |
| -string word = 3; |
117 |
| - |
118 |
| -// Time offset relative to the beginning of the audio that corresponds to the |
119 |
| -// start of the spoken word. This is an experimental feature and the accuracy |
120 |
| -// of the time offset can vary. |
121 |
| -google.protobuf.Duration start_offset = 1; |
122 |
| - |
123 |
| -// Time offset relative to the beginning of the audio that corresponds to the |
124 |
| -// end of the spoken word. This is an experimental feature and the accuracy of |
125 |
| -// the time offset can vary. |
126 |
| -google.protobuf.Duration end_offset = 2; |
// Hints for the speech recognizer to help with recognition in a specific
// conversation state.
message SpeechContext {
  // Optional. A list of strings containing words and phrases that the speech
  // recognizer should recognize with higher likelihood.
  //
  // This list can be used to:
  //
  // * improve accuracy for words and phrases you expect the user to say,
  //   e.g. typical commands for your Dialogflow agent
  // * add additional words to the speech recognizer vocabulary
  // * ...
  //
  // See the [Cloud Speech
  // documentation](https://cloud.google.com/speech-to-text/quotas) for usage
  // limits.
  repeated string phrases = 1;

  // Optional. Boost for this context compared to other contexts:
  //
  // * If the boost is positive, Dialogflow will increase the probability that
  //   the phrases in this context are recognized over similar sounding
  //   phrases.
  // * If the boost is unspecified or non-positive, Dialogflow will not apply
  //   any boost.
  //
  // Dialogflow recommends that you use boosts in the range (0, 20] and that
  // you find a value that fits your use case with binary search.
  float boost = 2;
}
|
137 | 112 |
|
138 | 113 | // Variant of the specified [Speech model][google.cloud.dialogflow.v2beta1.InputAudioConfig.model] to use.
|
@@ -178,6 +153,31 @@ enum SpeechModelVariant {
|
178 | 153 | USE_ENHANCED = 3;
|
179 | 154 | }
|
180 | 155 |
|
// Information for a word recognized by the speech recognizer.
//
// NOTE(review): field numbers are intentionally non-sequential (word = 3,
// offsets = 1 and 2, confidence = 4); they identify fields on the wire and
// must not be renumbered.
message SpeechWordInfo {
  // The word this info is for.
  string word = 3;

  // Time offset relative to the beginning of the audio that corresponds to
  // the start of the spoken word. This is an experimental feature and the
  // accuracy of the time offset can vary.
  google.protobuf.Duration start_offset = 1;

  // Time offset relative to the beginning of the audio that corresponds to
  // the end of the spoken word. This is an experimental feature and the
  // accuracy of the time offset can vary.
  google.protobuf.Duration end_offset = 2;

  // The Speech confidence between 0.0 and 1.0 for this word. A higher number
  // indicates an estimated greater likelihood that the recognized word is
  // correct. The default of 0.0 is a sentinel value indicating that
  // confidence was not set.
  //
  // This field is not guaranteed to be fully stable over time for the same
  // audio input. Users should also not rely on it to always be provided.
  float confidence = 4;
}
| 180 | + |
181 | 181 | // Instructs the speech recognizer on how to process the audio content.
|
182 | 182 | message InputAudioConfig {
|
183 | 183 | // Required. Audio encoding of the audio content to process.
|
@@ -257,6 +257,23 @@ message InputAudioConfig {
|
257 | 257 | bool disable_no_speech_recognized_event = 14;
|
258 | 258 | }
|
259 | 259 |
|
// Gender of the voice as described in
// [SSML voice element](https://www.w3.org/TR/speech-synthesis11/#edef_voice).
enum SsmlVoiceGender {
  // An unspecified gender, which means that the client doesn't care which
  // gender the selected voice will have.
  SSML_VOICE_GENDER_UNSPECIFIED = 0;

  // A male voice.
  SSML_VOICE_GENDER_MALE = 1;

  // A female voice.
  SSML_VOICE_GENDER_FEMALE = 2;

  // A gender-neutral voice.
  SSML_VOICE_GENDER_NEUTRAL = 3;
}
| 276 | + |
260 | 277 | // Description of which voice to use for speech synthesis.
|
261 | 278 | message VoiceSelectionParams {
|
262 | 279 | // Optional. The name of the voice. If not set, the service will choose a
|
@@ -307,47 +324,6 @@ message SynthesizeSpeechConfig {
|
307 | 324 | VoiceSelectionParams voice = 4;
|
308 | 325 | }
|
309 | 326 |
|
310 |
| -// Gender of the voice as described in |
311 |
| -// [SSML voice element](https://www.w3.org/TR/speech-synthesis11/#edef_voice). |
312 |
| -enum SsmlVoiceGender { |
313 |
| -// An unspecified gender, which means that the client doesn't care which |
314 |
| -// gender the selected voice will have. |
315 |
| -SSML_VOICE_GENDER_UNSPECIFIED = 0; |
316 |
| - |
317 |
| -// A male voice. |
318 |
| -SSML_VOICE_GENDER_MALE = 1; |
319 |
| - |
320 |
| -// A female voice. |
321 |
| -SSML_VOICE_GENDER_FEMALE = 2; |
322 |
| - |
323 |
| -// A gender-neutral voice. |
324 |
| -SSML_VOICE_GENDER_NEUTRAL = 3; |
325 |
| -} |
326 |
| - |
327 |
| -// Instructs the speech synthesizer how to generate the output audio content. |
328 |
| -// If this audio config is supplied in a request, it overrides all existing |
329 |
| -// text-to-speech settings applied to the agent. |
330 |
| -message OutputAudioConfig { |
331 |
| -// Required. Audio encoding of the synthesized audio content. |
332 |
| -OutputAudioEncoding audio_encoding = 1 [(google.api.field_behavior) = REQUIRED]; |
333 |
| - |
334 |
| -// The synthesis sample rate (in hertz) for this audio. If not |
335 |
| -// provided, then the synthesizer will use the default sample rate based on |
336 |
| -// the audio encoding. If this is different from the voice's natural sample |
337 |
| -// rate, then the synthesizer will honor this request by converting to the |
338 |
| -// desired sample rate (which might result in worse audio quality). |
339 |
| -int32 sample_rate_hertz = 2; |
340 |
| - |
341 |
| -// Configuration of how speech should be synthesized. |
342 |
| -SynthesizeSpeechConfig synthesize_speech_config = 3; |
343 |
| -} |
344 |
| - |
345 |
| -// A wrapper of repeated TelephonyDtmf digits. |
346 |
| -message TelephonyDtmfEvents { |
347 |
| -// A sequence of TelephonyDtmf digits. |
348 |
| -repeated TelephonyDtmf dtmf_events = 1; |
349 |
| -} |
350 |
| - |
351 | 327 | // Audio encoding of the output audio format in Text-To-Speech.
|
352 | 328 | enum OutputAudioEncoding {
|
353 | 329 | // Not specified.
|
@@ -373,16 +349,22 @@ enum OutputAudioEncoding {
|
373 | 349 | OUTPUT_AUDIO_ENCODING_MULAW = 5;
|
374 | 350 | }
|
375 | 351 |
|
376 |
| -// Configures speech transcription for [ConversationProfile][google.cloud.dialogflow.v2beta1.ConversationProfile]. |
377 |
| -message SpeechToTextConfig { |
378 |
| -// The speech model used in speech to text. |
379 |
| -// `SPEECH_MODEL_VARIANT_UNSPECIFIED`, `USE_BEST_AVAILABLE` will be treated as |
380 |
| -// `USE_ENHANCED`. It can be overridden in [AnalyzeContentRequest][google.cloud.dialogflow.v2beta1.AnalyzeContentRequest] and |
381 |
| -// [AnalyzeContentRequest][google.cloud.dialogflow.v2beta1.AnalyzeContentRequest] request. |
382 |
| -// If enhanced model variant is specified and an enhanced |
383 |
| -// version of the specified model for the language does not exist, then it |
384 |
| -// would emit an error. |
385 |
| -SpeechModelVariant speech_model_variant = 1; |
// Instructs the speech synthesizer how to generate the output audio content.
// If this audio config is supplied in a request, it overrides all existing
// text-to-speech settings applied to the agent.
message OutputAudioConfig {
  // Required. Audio encoding of the synthesized audio content.
  OutputAudioEncoding audio_encoding = 1 [(google.api.field_behavior) = REQUIRED];

  // The synthesis sample rate (in hertz) for this audio. If not
  // provided, then the synthesizer will use the default sample rate based on
  // the audio encoding. If this is different from the voice's natural sample
  // rate, then the synthesizer will honor this request by converting to the
  // desired sample rate (which might result in worse audio quality).
  int32 sample_rate_hertz = 2;

  // Configuration of how speech should be synthesized.
  SynthesizeSpeechConfig synthesize_speech_config = 3;
}
|
387 | 369 |
|
388 | 370 | // [DTMF](https://en.wikipedia.org/wiki/Dual-tone_multi-frequency_signaling)
|
@@ -439,3 +421,21 @@ enum TelephonyDtmf {
|
439 | 421 | // Pound/diamond/hash/square/gate/octothorpe: '#'.
|
440 | 422 | DTMF_POUND = 16;
|
441 | 423 | }
|
| 424 | + |
// A wrapper of repeated TelephonyDtmf digits.
message TelephonyDtmfEvents {
  // A sequence of TelephonyDtmf digits.
  repeated TelephonyDtmf dtmf_events = 1;
}
| 430 | + |
// Configures speech transcription for
// [ConversationProfile][google.cloud.dialogflow.v2beta1.ConversationProfile].
message SpeechToTextConfig {
  // The speech model used in speech to text.
  // `SPEECH_MODEL_VARIANT_UNSPECIFIED`, `USE_BEST_AVAILABLE` will be treated
  // as `USE_ENHANCED`. It can be overridden in
  // [AnalyzeContentRequest][google.cloud.dialogflow.v2beta1.AnalyzeContentRequest] and
  // [StreamingAnalyzeContentRequest][google.cloud.dialogflow.v2beta1.StreamingAnalyzeContentRequest]
  // request. (Original comment linked `AnalyzeContentRequest` twice; the
  // second link corrected to the streaming request per upstream googleapis.)
  // If enhanced model variant is specified and an enhanced
  // version of the specified model for the language does not exist, then it
  // would emit an error.
  SpeechModelVariant speech_model_variant = 1;
}
0 commit comments