Update all json files; make some fixes to make it work at all
(mirror of https://github.com/OMGeeky/google-apis-rs.git)
@@ -3,7 +3,7 @@
 "oauth2": {
 "scopes": {
 "https://www.googleapis.com/auth/cloud-platform": {
-"description": "See, edit, configure, and delete your Google Cloud Platform data"
+"description": "See, edit, configure, and delete your Google Cloud data and see the email address for your Google Account."
 }
 }
 }
@@ -187,7 +187,7 @@
 ],
 "parameters": {
 "parent": {
-"description": "Required. The parent resource where this custom class will be created. Format: {api_version}/projects/{project}/locations/{location}/customClasses",
+"description": "Required. The parent resource where this custom class will be created. Format: `projects/{project}/locations/{location}/customClasses` Speech-to-Text supports three locations: `global`, `us` (US North America), and `eu` (Europe). If you are calling the `speech.googleapis.com` endpoint, use the `global` location. To specify a region, use a [regional endpoint](/speech-to-text/docs/endpoints) with matching `us` or `eu` location value.",
 "location": "path",
 "pattern": "^projects/[^/]+/locations/[^/]+$",
 "required": true,
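
For orientation, a hedged sketch of a create call against this method: a POST to `projects/my-project/locations/eu/customClasses` on a regional endpoint such as `eu-speech.googleapis.com` (the host name and the `value` field on `ClassItem` are assumptions; neither is shown in this diff, and `my-project`/`my-months` are placeholders). The body is a `CreateCustomClassRequest`:

    {
      "customClassId": "my-months",
      "customClass": {
        "items": [
          { "value": "january" },
          { "value": "february" }
        ]
      }
    }
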
@@ -215,7 +215,7 @@
 ],
 "parameters": {
 "name": {
-"description": "Required. The name of the custom class to delete. Format: {api_version}/projects/{project}/locations/{location}/customClasses/{custom_class}",
+"description": "Required. The name of the custom class to delete. Format: `projects/{project}/locations/{location}/customClasses/{custom_class}` Speech-to-Text supports three locations: `global`, `us` (US North America), and `eu` (Europe). If you are calling the `speech.googleapis.com` endpoint, use the `global` location. To specify a region, use a [regional endpoint](/speech-to-text/docs/endpoints) with matching `us` or `eu` location value.",
 "location": "path",
 "pattern": "^projects/[^/]+/locations/[^/]+/customClasses/[^/]+$",
 "required": true,
@@ -240,7 +240,7 @@
 ],
 "parameters": {
 "name": {
-"description": "Required. The name of the custom class to retrieve. Format: {api_version}/projects/{project}/locations/{location}/customClasses/{custom_class}",
+"description": "Required. The name of the custom class to retrieve. Format: `projects/{project}/locations/{location}/customClasses/{custom_class}`",
 "location": "path",
 "pattern": "^projects/[^/]+/locations/[^/]+/customClasses/[^/]+$",
 "required": true,
@@ -276,7 +276,7 @@
 "type": "string"
 },
 "parent": {
-"description": "Required. The parent, which owns this collection of custom classes. Format: {api_version}/projects/{project}/locations/{location}/customClasses",
+"description": "Required. The parent, which owns this collection of custom classes. Format: `projects/{project}/locations/{location}/customClasses` Speech-to-Text supports three locations: `global`, `us` (US North America), and `eu` (Europe). If you are calling the `speech.googleapis.com` endpoint, use the `global` location. To specify a region, use a [regional endpoint](/speech-to-text/docs/endpoints) with matching `us` or `eu` location value.",
 "location": "path",
 "pattern": "^projects/[^/]+/locations/[^/]+$",
 "required": true,
@@ -339,7 +339,7 @@
 ],
 "parameters": {
 "parent": {
-"description": "Required. The parent resource where this phrase set will be created. Format: {api_version}/projects/{project}/locations/{location}/phraseSets",
+"description": "Required. The parent resource where this phrase set will be created. Format: `projects/{project}/locations/{location}/phraseSets` Speech-to-Text supports three locations: `global`, `us` (US North America), and `eu` (Europe). If you are calling the `speech.googleapis.com` endpoint, use the `global` location. To specify a region, use a [regional endpoint](/speech-to-text/docs/endpoints) with matching `us` or `eu` location value.",
 "location": "path",
 "pattern": "^projects/[^/]+/locations/[^/]+$",
 "required": true,
@@ -367,7 +367,7 @@
 ],
 "parameters": {
 "name": {
-"description": "Required. The name of the phrase set to delete. Format: {api_version}/projects/{project}/locations/{location}/phraseSets/{phrase_set}",
+"description": "Required. The name of the phrase set to delete. Format: `projects/{project}/locations/{location}/phraseSets/{phrase_set}`",
 "location": "path",
 "pattern": "^projects/[^/]+/locations/[^/]+/phraseSets/[^/]+$",
 "required": true,
@@ -392,7 +392,7 @@
 ],
 "parameters": {
 "name": {
-"description": "Required. The name of the phrase set to retrieve. Format: {api_version}/projects/{project}/locations/{location}/phraseSets/{phrase_set}",
+"description": "Required. The name of the phrase set to retrieve. Format: `projects/{project}/locations/{location}/phraseSets/{phrase_set}` Speech-to-Text supports three locations: `global`, `us` (US North America), and `eu` (Europe). If you are calling the `speech.googleapis.com` endpoint, use the `global` location. To specify a region, use a [regional endpoint](/speech-to-text/docs/endpoints) with matching `us` or `eu` location value.",
 "location": "path",
 "pattern": "^projects/[^/]+/locations/[^/]+/phraseSets/[^/]+$",
 "required": true,
@@ -428,7 +428,7 @@
 "type": "string"
 },
 "parent": {
-"description": "Required. The parent, which owns this collection of phrase set. Format: projects/{project}/locations/{location}",
+"description": "Required. The parent, which owns this collection of phrase set. Format: `projects/{project}/locations/{location}` Speech-to-Text supports three locations: `global`, `us` (US North America), and `eu` (Europe). If you are calling the `speech.googleapis.com` endpoint, use the `global` location. To specify a region, use a [regional endpoint](/speech-to-text/docs/endpoints) with matching `us` or `eu` location value.",
 "location": "path",
 "pattern": "^projects/[^/]+/locations/[^/]+$",
 "required": true,
@@ -524,7 +524,7 @@
 }
 }
 },
-"revision": "20210325",
+"revision": "20220221",
 "rootUrl": "https://speech.googleapis.com/",
 "schemas": {
 "ClassItem": {
@@ -547,7 +547,7 @@
 "description": "Required. The custom class to create."
 },
 "customClassId": {
-"description": "The ID to use for the custom class, which will become the final component of the custom class' resource name. This value should be 4-63 characters, and valid characters are /a-z-/.",
+"description": "Required. The ID to use for the custom class, which will become the final component of the custom class' resource name. This value should be 4-63 characters, and valid characters are /a-z-/.",
 "type": "string"
 }
 },
@@ -562,7 +562,7 @@
 "description": "Required. The phrase set to create."
 },
 "phraseSetId": {
-"description": "The ID to use for the phrase set, which will become the final component of the phrase set's resource name. This value should be 4-63 characters, and valid characters are /a-z-/.",
+"description": "Required. The ID to use for the phrase set, which will become the final component of the phrase set's resource name. This value should be 4-63 characters, and valid characters are /a-z-/.",
 "type": "string"
 }
 },
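
Since both ID fields are now marked Required, a create call must carry them in the body. A minimal sketch of a `CreatePhraseSetRequest` body, assuming the `phrases` field on `PhraseSet` and the `value` field on `Phrase` from the wider schema (not shown in this hunk); the ID obeys the stated 4-63 character, /a-z-/ constraint:

    {
      "phraseSetId": "support-commands",
      "phraseSet": {
        "phrases": [
          { "value": "cancel my subscription", "boost": 10 }
        ]
      }
    }
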
@@ -596,6 +596,25 @@
 "properties": {},
 "type": "object"
 },
+"Entry": {
+"description": "A single replacement configuration.",
+"id": "Entry",
+"properties": {
+"caseSensitive": {
+"description": "Whether the search is case sensitive.",
+"type": "boolean"
+},
+"replace": {
+"description": "What to replace with. Max length is 100 characters.",
+"type": "string"
+},
+"search": {
+"description": "What to replace. Max length is 100 characters.",
+"type": "string"
+}
+},
+"type": "object"
+},
 "ListCustomClassesResponse": {
 "description": "Message returned to the client by the `ListCustomClasses` method.",
 "id": "ListCustomClassesResponse",
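
The new `Entry` schema is consumed by the `TranscriptNormalization` schema added further down in this diff. Per that schema's description, entries are applied one at a time in order, so more specific search strings must come first or they will never match. A sketch of a conforming `entries` list, reusing the cat/dog example from the description but ordered so both entries can apply:

    {
      "entries": [
        { "search": "mountain cat", "replace": "mountain dog", "caseSensitive": false },
        { "search": "cat", "replace": "dog", "caseSensitive": false }
      ]
    }
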
@@ -719,6 +738,11 @@
 "$ref": "SpeechRecognitionResult"
 },
 "type": "array"
 },
+"totalBilledTime": {
+"description": "When available, billed audio seconds for the corresponding request.",
+"format": "google-duration",
+"type": "string"
+}
 },
 "type": "object"
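
The `google-duration` JSON format is a decimal number of seconds with an `s` suffix, so a response message carrying the new field would look roughly like the sketch below (the transcript text is invented for illustration, and the `alternatives` shape comes from `SpeechRecognitionResult`, which this hunk does not show):

    {
      "results": [
        { "alternatives": [ { "transcript": "hello world" } ] }
      ],
      "totalBilledTime": "15s"
    }
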
@@ -759,11 +783,11 @@
 "type": "object"
 },
 "Phrase": {
-"description": "A phrases containing words and phrase \"hints\" so that the speech recognition is more likely to recognize them. This can be used to improve the accuracy for specific words and phrases, for example, if specific commands are typically spoken by the user. This can also be used to add additional words to the vocabulary of the recognizer. See [usage limits](https://cloud.google.com/speech-to-text/quotas#content). List items can also include pre-built or custom classes containing groups of words that represent common concepts that occur in natural language. For example, rather than providing a phrase hint for every month of the year (e.g. \"i was born in january\", \"i was born in febuary\", ...), use the pre-built `$MONTH` class improves the likelihood of correctly transcribing audio that includes months (e.g. \"i was born in $month\"). To refer to pre-built classes, use the class' symbol prepended with `$` e.g. `$MONTH`. To refer to custom classes that were defined inline in the request, set the class's `custom_class_id` to a string unique to all class resources and inline classes. Then use the class' id wrapped in $`{...}` e.g. \"${my-months}\". To refer to custom classes resources, use the class' id wrapped in `${}` (e.g. `${my-months}`).",
+"description": "A phrases containing words and phrase \"hints\" so that the speech recognition is more likely to recognize them. This can be used to improve the accuracy for specific words and phrases, for example, if specific commands are typically spoken by the user. This can also be used to add additional words to the vocabulary of the recognizer. See [usage limits](https://cloud.google.com/speech-to-text/quotas#content). List items can also include pre-built or custom classes containing groups of words that represent common concepts that occur in natural language. For example, rather than providing a phrase hint for every month of the year (e.g. \"i was born in january\", \"i was born in febuary\", ...), use the pre-built `$MONTH` class improves the likelihood of correctly transcribing audio that includes months (e.g. \"i was born in $month\"). To refer to pre-built classes, use the class' symbol prepended with `$` e.g. `$MONTH`. To refer to custom classes that were defined inline in the request, set the class's `custom_class_id` to a string unique to all class resources and inline classes. Then use the class' id wrapped in $`{...}` e.g. \"${my-months}\". To refer to custom classes resources, use the class' id wrapped in `${}` (e.g. `${my-months}`). Speech-to-Text supports three locations: `global`, `us` (US North America), and `eu` (Europe). If you are calling the `speech.googleapis.com` endpoint, use the `global` location. To specify a region, use a [regional endpoint](/speech-to-text/docs/endpoints) with matching `us` or `eu` location value.",
 "id": "Phrase",
 "properties": {
 "boost": {
-"description": "Hint Boost. Overrides the boost set at the phrase set level. Positive value will increase the probability that a specific phrase will be recognized over other similar sounding phrases. The higher the boost, the higher the chance of false positive recognition as well. Negative boost values would correspond to anti-biasing. Anti-biasing is not enabled, so negative boost will simply be ignored. Though `boost` can accept a wide range of positive values, most use cases are best served with values between 0 and 20. We recommend using a binary search approach to finding the optimal value for your use case. Speech recognition will skip PhraseSets with a boost value of 0.",
+"description": "Hint Boost. Overrides the boost set at the phrase set level. Positive value will increase the probability that a specific phrase will be recognized over other similar sounding phrases. The higher the boost, the higher the chance of false positive recognition as well. Negative boost will simply be ignored. Though `boost` can accept a wide range of positive values, most use cases are best served with values between 0 and 20. We recommend using a binary search approach to finding the optimal value for your use case. Speech recognition will skip PhraseSets with a boost value of 0.",
 "format": "float",
 "type": "number"
 },
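
Tying the two changed descriptions together: a phrase may reference a pre-built class such as `$MONTH`, and per the `boost` description most use cases are best served with values between 0 and 20. A minimal sketch of the `phrases` list inside a `PhraseSet` (the `value` field belongs to the `Phrase` schema but is not shown in this hunk):

    {
      "phrases": [
        { "value": "i was born in $MONTH", "boost": 15 }
      ]
    }
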
@@ -819,7 +843,7 @@
 "properties": {
 "adaptation": {
 "$ref": "SpeechAdaptation",
-"description": "Speech adaptation configuration improves the accuracy of speech recognition. When speech adaptation is set it supersedes the `speech_contexts` field. For more information, see the [speech adaptation](https://cloud.google.com/speech-to-text/docs/adaptation) documentation."
+"description": "Speech adaptation configuration improves the accuracy of speech recognition. For more information, see the [speech adaptation](https://cloud.google.com/speech-to-text/docs/adaptation) documentation. When speech adaptation is set it supersedes the `speech_contexts` field."
 },
 "alternativeLanguageCodes": {
 "description": "A list of up to 3 additional [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tags, listing possible alternative languages of the supplied audio. See [Language Support](https://cloud.google.com/speech-to-text/docs/languages) for a list of the currently supported language codes. If alternative languages are listed, recognition result will contain recognition in the most likely language detected including the main language_code. The recognition result will include the language tag of the language detected in the audio. Note: This feature is only supported for Voice Command and Voice Search use cases and performance may vary for other use cases (e.g., phone call transcription).",
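
In practice, a config that sets `adaptation` can drop any `speechContexts` entries, since they would be superseded anyway. A sketch of a `RecognitionConfig` fragment referencing the phrase set created earlier; the `phraseSetReferences` field on `SpeechAdaptation` is an assumption, as that schema is not shown in this diff:

    {
      "languageCode": "en-US",
      "adaptation": {
        "phraseSetReferences": [
          "projects/my-project/locations/global/phraseSets/support-commands"
        ]
      }
    }
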
@@ -854,6 +878,14 @@
 "description": "If 'true', enables speaker detection for each recognized word in the top alternative of the recognition result using a speaker_tag provided in the WordInfo. Note: Use diarization_config instead.",
 "type": "boolean"
 },
+"enableSpokenEmojis": {
+"description": "The spoken emoji behavior for the call If not set, uses default behavior based on model of choice If 'true', adds spoken emoji formatting for the request. This will replace spoken emojis with the corresponding Unicode symbols in the final transcript. If 'false', spoken emojis are not replaced.",
+"type": "boolean"
+},
+"enableSpokenPunctuation": {
+"description": "The spoken punctuation behavior for the call If not set, uses default behavior based on model of choice e.g. command_and_search will enable spoken punctuation by default If 'true', replaces spoken punctuation with the corresponding symbols in the request. For example, \"how are you question mark\" becomes \"how are you?\". See https://cloud.google.com/speech-to-text/docs/spoken-punctuation for support. If 'false', spoken punctuation is not replaced.",
+"type": "boolean"
+},
 "enableWordConfidence": {
 "description": "If `true`, the top result includes a list of words and the confidence for those words. If `false`, no word-level confidence information is returned. The default is `false`.",
 "type": "boolean"
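
A `RecognitionConfig` fragment exercising both new flags; per the descriptions above, spoken "how are you question mark" would come back as "how are you?", and spoken emojis would be replaced with the corresponding Unicode symbols:

    {
      "languageCode": "en-US",
      "enableSpokenPunctuation": true,
      "enableSpokenEmojis": true
    }
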
@@ -873,7 +905,8 @@
 "AMR_WB",
 "OGG_OPUS",
 "SPEEX_WITH_HEADER_BYTE",
-"MP3"
+"MP3",
+"WEBM_OPUS"
 ],
 "enumDescriptions": [
 "Not specified.",
@@ -884,7 +917,8 @@
 "Adaptive Multi-Rate Wideband codec. `sample_rate_hertz` must be 16000.",
 "Opus encoded audio frames in Ogg container ([OggOpus](https://wiki.xiph.org/OggOpus)). `sample_rate_hertz` must be one of 8000, 12000, 16000, 24000, or 48000.",
 "Although the use of lossy encodings is not recommended, if a very low bitrate encoding is required, `OGG_OPUS` is highly preferred over Speex encoding. The [Speex](https://speex.org/) encoding supported by Cloud Speech API has a header byte in each block, as in MIME type `audio/x-speex-with-header-byte`. It is a variant of the RTP Speex encoding defined in [RFC 5574](https://tools.ietf.org/html/rfc5574). The stream is a sequence of blocks, one block per RTP packet. Each block starts with a byte containing the length of the block, in bytes, followed by one or more frames of Speex data, padded to an integral number of bytes (octets) as specified in RFC 5574. In other words, each RTP header is replaced with a single byte containing the block length. Only Speex wideband is supported. `sample_rate_hertz` must be 16000.",
-"MP3 audio. MP3 encoding is a Beta feature and only available in v1p1beta1. Support all standard MP3 bitrates (which range from 32-320 kbps). When using this encoding, `sample_rate_hertz` has to match the sample rate of the file being used."
+"MP3 audio. MP3 encoding is a Beta feature and only available in v1p1beta1. Support all standard MP3 bitrates (which range from 32-320 kbps). When using this encoding, `sample_rate_hertz` has to match the sample rate of the file being used.",
+"Opus encoded audio frames in WebM container ([OggOpus](https://wiki.xiph.org/OggOpus)). `sample_rate_hertz` must be one of 8000, 12000, 16000, 24000, or 48000."
 ],
 "type": "string"
 },
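
Per the new enum description, WebM Opus audio must declare one of the listed sample rates. A minimal config sketch for the newly added encoding:

    {
      "encoding": "WEBM_OPUS",
      "sampleRateHertz": 48000,
      "languageCode": "en-US"
    }
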
@@ -902,7 +936,7 @@
 "description": "Metadata regarding this request."
 },
 "model": {
-"description": "Which model to select for the given request. Select the model best suited to your domain to get best results. If a model is not explicitly specified, then we auto-select a model based on the parameters in the RecognitionConfig. *Model* *Description* command_and_search Best for short queries such as voice commands or voice search. phone_call Best for audio that originated from a phone call (typically recorded at an 8khz sampling rate). video Best for audio that originated from from video or includes multiple speakers. Ideally the audio is recorded at a 16khz or greater sampling rate. This is a premium model that costs more than the standard rate. default Best for audio that is not one of the specific audio models. For example, long-form audio. Ideally the audio is high-fidelity, recorded at a 16khz or greater sampling rate. ",
+"description": "Which model to select for the given request. Select the model best suited to your domain to get best results. If a model is not explicitly specified, then we auto-select a model based on the parameters in the RecognitionConfig. *Model* *Description* command_and_search Best for short queries such as voice commands or voice search. phone_call Best for audio that originated from a phone call (typically recorded at an 8khz sampling rate). video Best for audio that originated from video or includes multiple speakers. Ideally the audio is recorded at a 16khz or greater sampling rate. This is a premium model that costs more than the standard rate. default Best for audio that is not one of the specific audio models. For example, long-form audio. Ideally the audio is high-fidelity, recorded at a 16khz or greater sampling rate. medical_conversation Best for audio that originated from a conversation between a medical provider and patient. medical_dictation Best for audio that originated from dictation notes by a medical provider. ",
 "type": "string"
 },
 "profanityFilter": {
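
With the two medical models now documented, selecting one is just a matter of naming it; if `model` is omitted, the service auto-selects based on the rest of the `RecognitionConfig`:

    {
      "languageCode": "en-US",
      "model": "medical_dictation"
    }
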
@@ -921,6 +955,10 @@
 },
 "type": "array"
 },
+"transcriptNormalization": {
+"$ref": "TranscriptNormalization",
+"description": "Use transcription normalization to automatically replace parts of the transcript with phrases of your choosing. For StreamingRecognize, this normalization only applies to stable partial transcripts (stability > 0.8) and final transcripts."
+},
 "useEnhanced": {
 "description": "Set to true to use an enhanced model for speech recognition. If `use_enhanced` is set to true and the `model` field is not set, then an appropriate enhanced model is chosen if an enhanced model exists for the audio. If `use_enhanced` is true and an enhanced version of the specified model does not exist, then the speech is recognized using the standard version of the specified model.",
 "type": "boolean"
@@ -1060,6 +1098,11 @@
 "$ref": "SpeechRecognitionResult"
 },
 "type": "array"
 },
+"totalBilledTime": {
+"description": "When available, billed audio seconds for the corresponding request.",
+"format": "google-duration",
+"type": "string"
+}
 },
 "type": "object"
@@ -1148,7 +1191,7 @@
 "type": "number"
 },
 "transcript": {
-"description": "Transcript text representing the words that the user spoke.",
+"description": "Transcript text representing the words that the user spoke. In languages that use spaces to separate words, the transcript might have a leading space if it isn't the first result. You can concatenate each result to obtain the full transcript without using a separator.",
 "type": "string"
 },
 "words": {
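
The clarified description means a client can rebuild the full transcript by straight concatenation, because the service emits the separating space itself. An illustrative response fragment (the text is invented for the example), with the second transcript carrying the leading space:

    {
      "results": [
        { "alternatives": [ { "transcript": "how old is the Brooklyn Bridge" } ] },
        { "alternatives": [ { "transcript": " it opened in 1883" } ] }
      ]
    }
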
@@ -1181,6 +1224,11 @@
 "description": "Output only. The [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag of the language in this result. This language code was detected to have the most likelihood of being spoken in the audio.",
 "readOnly": true,
 "type": "string"
 },
+"resultEndTime": {
+"description": "Time offset of the end of this result relative to the beginning of the audio.",
+"format": "google-duration",
+"type": "string"
+}
 },
 "type": "object"
@@ -1212,6 +1260,20 @@
 },
 "type": "object"
 },
+"TranscriptNormalization": {
+"description": "Transcription normalization configuration. Use transcription normalization to automatically replace parts of the transcript with phrases of your choosing. For StreamingRecognize, this normalization only applies to stable partial transcripts (stability > 0.8) and final transcripts.",
+"id": "TranscriptNormalization",
+"properties": {
+"entries": {
+"description": "A list of replacement entries. We will perform replacement with one entry at a time. For example, the second entry in [\"cat\" => \"dog\", \"mountain cat\" => \"mountain dog\"] will never be applied because we will always process the first entry before it. At most 100 entries.",
+"items": {
+"$ref": "Entry"
+},
+"type": "array"
+}
+},
+"type": "object"
+},
 "TranscriptOutputConfig": {
 "description": "Specifies an optional destination for the recognition results.",
 "id": "TranscriptOutputConfig",