Skip to content

Commit 885f680

Browse files
ehdsouza authored and germanattanasio committed
feat(SpeechToText): Add support for Web Sockets (#376)
* Initial implementation of speech to text using web sockets
* Speech to text using web sockets
* fix: Adding the pyOpenSSL dependency
* docs: Adding IBM copyright
* refactor: Following clean coding practices — better naming conventions, added method description in the method body
* refactor: Better naming conventions
* refactor: Removing the extra close signal
* refactor: Naming conventions, error message
* refactor: Remove print
* feat: Adding microphone example for speech to text
1 parent 8dfc0f9 commit 885f680

9 files changed

Lines changed: 445 additions & 4 deletions

File tree

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
from __future__ import print_function
2+
import pyaudio
3+
import tempfile
4+
from watson_developer_cloud import SpeechToTextV1
5+
from watson_developer_cloud.websocket import RecognizeCallback, RecognizeListener
6+
7+
speech_to_text = SpeechToTextV1(
8+
username='YOUR SERVICE USERNAME',
9+
password='YOUR SERVICE PASSWORD',
10+
url='https://stream.watsonplatform.net/speech-to-text/api')
11+
12+
# Example using websockets
13+
class MyRecognizeCallback(RecognizeCallback):
    """Handler that logs every speech-to-text websocket event to stdout."""

    def __init__(self):
        pass

    def on_connected(self):
        # Websocket handshake with the service succeeded.
        print('Connection was successful')

    def on_listening(self):
        # Service acknowledged the start message and is accepting audio.
        print('Service is listening')

    def on_hypothesis(self, hypothesis):
        # Final hypothesis text returned by the service.
        print(hypothesis)

    def on_transcription(self, transcript):
        # Interim transcription result.
        print(transcript)

    def on_transcription_complete(self):
        print('Transcription completed')

    def on_error(self, error):
        print('Error received: {}'.format(error))

    def on_inactivity_timeout(self, error):
        print('Inactivity timeout: {}'.format(error))
37+
38+
mycallback = MyRecognizeCallback()
tmp = tempfile.NamedTemporaryFile()

# Microphone capture parameters. RATE matches the default
# 'audio/l16; rate=44100' content type used by recognize_with_websocket.
FORMAT = pyaudio.paInt16   # 16-bit signed samples
CHANNELS = 1               # mono
RATE = 44100               # samples per second
CHUNK = 1024               # frames per buffer read
RECORD_SECONDS = 5

audio = pyaudio.PyAudio()
stream = audio.open(format=FORMAT, channels=CHANNELS,
                    rate=RATE, input=True,
                    frames_per_buffer=CHUNK)

print('recording....')
# stream.read() returns raw bytes, so the temp file must be opened in
# binary mode: 'w' raises TypeError on Python 3 and can corrupt the audio
# via newline translation on Windows.
with open(tmp.name, 'wb') as f:
    for _ in range(0, int(RATE / CHUNK * RECORD_SECONDS)):
        data = stream.read(CHUNK)
        f.write(data)

stream.stop_stream()
stream.close()
audio.terminate()
print('Done recording...')

# Re-open in binary mode to stream the captured audio to the service.
with open(tmp.name, 'rb') as f:
    speech_to_text.recognize_with_websocket(audio=f, recognize_callback=mycallback)

examples/speech_to_text_v1.py

Lines changed: 35 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,12 @@
22
import json
33
from os.path import join, dirname
44
from watson_developer_cloud import SpeechToTextV1
5+
from watson_developer_cloud.websocket import RecognizeCallback, RecognizeListener
56

67
speech_to_text = SpeechToTextV1(
78
username='YOUR SERVICE USERNAME',
8-
password='YOUR SERVICE PASSWORD')
9+
password='YOUR SERVICE PASSWORD',
10+
url='https://stream.watsonplatform.net/speech-to-text/api')
911

1012
print(json.dumps(speech_to_text.list_models(), indent=2))
1113

@@ -21,3 +23,35 @@
2123
timestamps=True,
2224
word_confidence=True),
2325
indent=2))
26+
27+
# Example using websockets
class MyRecognizeCallback(RecognizeCallback):
    """Prints each event reported by the speech-to-text websocket listener."""

    def __init__(self):
        pass

    def on_connected(self):
        # Websocket handshake with the service succeeded.
        print('Connection was successful')

    def on_listening(self):
        # Service is ready to receive audio.
        print('Service is listening')

    def on_hypothesis(self, hypothesis):
        print(hypothesis)

    def on_transcription(self, transcript):
        print(transcript)

    def on_transcription_complete(self):
        print('Transcription completed')

    def on_error(self, error):
        print('Error received: {}'.format(error))

    def on_inactivity_timeout(self, error):
        print('Inactivity timeout: {}'.format(error))


mycallback = MyRecognizeCallback()
with open(join(dirname(__file__), '../resources/speech.wav'),
          'rb') as audio_file:
    speech_to_text.recognize_with_websocket(
        audio=audio_file, recognize_callback=mycallback)

requirements-dev.txt

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,3 +15,9 @@ pytest-cov>=2.2.1
1515
recommonmark>=0.2.0
1616
Sphinx>=1.3.1
1717
bumpversion>=0.5.3
18+
19+
# Web sockets
20+
autobahn>=0.10.9
21+
Twisted>=13.2.0
22+
pyOpenSSL>=16.2.0
23+
service-identity>=17.0.0

requirements.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,6 @@
11
requests>=2.0,<3.0
22
python_dateutil>=2.5.3
3+
autobahn>=0.10.9
4+
Twisted>=13.2.0
5+
pyOpenSSL>=16.2.0
6+
service-identity>=17.0.0

test/integration/test_examples.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
from glob import glob
1111

1212
# tests to exclude
13-
excludes = ['authorization_v1.py', 'discovery_v1.ipynb', '__init__.py']
13+
excludes = ['authorization_v1.py', 'discovery_v1.ipynb', '__init__.py', 'microphone-speech-to-text.py']
1414

1515
# examples path. /examples
1616
examples_path = join(dirname(__file__), '../', 'examples', '*.py')

watson_developer_cloud/speech_to_text_v1.py

Lines changed: 95 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,9 +25,12 @@
2525
from __future__ import absolute_import
2626

2727
import json
28-
from .watson_service import WatsonService
28+
from .watson_service import WatsonService, _remove_null_values
2929
from .utils import deprecated
30-
30+
from watson_developer_cloud.websocket import RecognizeCallback, RecognizeListener
31+
from os.path import isfile
32+
import base64
33+
import urllib
3134
##############################################################################
3235
# Service
3336
##############################################################################
@@ -183,6 +186,96 @@ def recognize(self,
183186
accept_json=True)
184187
return response
185188

189+
def recognize_with_websocket(self,
                             audio=None,
                             content_type='audio/l16; rate=44100',
                             model='en-US_BroadbandModel',
                             recognize_callback=None,
                             customization_id=None,
                             acoustic_customization_id=None,
                             customization_weight=None,
                             version=None,
                             inactivity_timeout=None,
                             interim_results=True,
                             keywords=None,
                             keywords_threshold=None,
                             max_alternatives=1,
                             word_alternatives_threshold=None,
                             word_confidence=False,
                             timestamps=False,
                             profanity_filter=None,
                             smart_formatting=False,
                             speaker_labels=None):
    """
    Sends audio for speech recognition using web sockets.

    :param file audio: Audio to transcribe, in the format given by `content_type`.
    :param str content_type: The type of the input: audio/basic, audio/flac,
        audio/l16, audio/mp3, audio/mpeg, audio/mulaw, audio/ogg (with optional
        opus/vorbis codecs), audio/wav, audio/webm (with optional opus/vorbis
        codecs), or multipart/form-data.
    :param str model: The identifier of the model to be used for recognition.
    :param RecognizeCallback recognize_callback: The instance handling events
        returned from the service.
    :param str customization_id: GUID of a custom language model; its base
        model must match `model`. Default: no custom language model.
    :param str acoustic_customization_id: GUID of a custom acoustic model; its
        base model must match `model`. Default: no custom acoustic model.
    :param float customization_weight: Weight (0.0-1.0) given to words from the
        custom language model relative to the base model. Default 0.3 unless a
        weight was set at training time.
    :param str version: Version of the base `model` to use; intended mainly for
        custom models upgraded to a new base model.
    :param int inactivity_timeout: Seconds of silence after which the
        connection is closed with a 400 error. Use `-1` for infinity.
    :param bool interim_results: Send back non-final previews of each
        "sentence" as it is being processed.
    :param list[str] keywords: Keyword strings to spot in the audio; requires
        `keywords_threshold` when non-empty.
    :param float keywords_threshold: Lower-bound confidence (0-1) for spotting
        a keyword; requires `keywords`.
    :param int max_alternatives: Maximum number of alternative transcripts.
    :param float word_alternatives_threshold: Lower-bound confidence (0-1) for
        reporting a word alternative ("Confusion Networks").
    :param bool word_confidence: If `true`, return per-word confidence.
    :param bool timestamps: If `true`, return per-word time alignment.
    :param bool profanity_filter: If `true` (the default), censor profanity
        with asterisks (US English only).
    :param bool speaker_labels: If `true`, label which words were spoken by
        which participant; forces `timestamps` on. Check model support via
        `GET /v1/models`.
    :param bool smart_formatting: If `true`, format dates, times, numbers,
        currency, and addresses readably in the final transcript (US English).
    :return: the RecognizeListener driving the websocket connection.
    """
    if audio is None:
        raise ValueError('Audio must be provided')
    if recognize_callback is None:
        raise ValueError('Recognize callback must be provided')
    if not isinstance(recognize_callback, RecognizeCallback):
        raise Exception('Callback is not a derived class of RecognizeCallback')

    # urlencode lives in urllib.parse on Python 3 and in urllib on Python 2;
    # the previous `urllib.urlencode(...)` call broke on Python 3.
    try:
        from urllib.parse import urlencode
    except ImportError:
        from urllib import urlencode

    headers = {}
    if self.default_headers is not None:
        headers = self.default_headers.copy()
    # b64encode requires bytes and returns bytes on Python 3; encode the
    # credentials first and decode the result so the header value is text on
    # both interpreters (str + bytes concatenation raises TypeError on py3,
    # and formatting raw bytes would yield a "Basic b'...'" header).
    credentials = '{0}:{1}'.format(self.username, self.password)
    base64_authorization = base64.b64encode(
        credentials.encode('utf-8')).decode('ascii')
    headers['Authorization'] = 'Basic {0}'.format(base64_authorization)

    # The websocket endpoint is served from the same host as the REST API.
    url = self.url.replace('https:', 'wss:')
    params = {
        'model': model,
        'customization_id': customization_id,
        'acoustic_customization_id': acoustic_customization_id,
        'customization_weight': customization_weight,
        'version': version
    }
    params = _remove_null_values(params)
    url = url + '/v1/recognize?{0}'.format(urlencode(params))

    # Options sent in the websocket "start" message; nulls are stripped so
    # service defaults apply for anything the caller did not set.
    options = {
        'content_type': content_type,
        'inactivity_timeout': inactivity_timeout,
        'interim_results': interim_results,
        'keywords': keywords,
        'keywords_threshold': keywords_threshold,
        'max_alternatives': max_alternatives,
        'word_alternatives_threshold': word_alternatives_threshold,
        'word_confidence': word_confidence,
        'timestamps': timestamps,
        'profanity_filter': profanity_filter,
        'smart_formatting': smart_formatting,
        'speaker_labels': speaker_labels
    }
    options = _remove_null_values(options)

    # The listener opens the connection and forwards events to the callback.
    # Returning it (the old code bound it to an unused local) lets callers
    # hold a reference without changing existing call sites.
    return RecognizeListener(audio,
                             options,
                             recognize_callback,
                             url,
                             headers)
278+
186279
#########################
187280
# asynchronous
188281
#########################
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
# coding: utf-8
2+
3+
# Copyright 2018 IBM All Rights Reserved.
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License");
6+
# you may not use this file except in compliance with the License.
7+
# You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.
16+
17+
from .recognize_abstract_callback import RecognizeCallback
18+
from .speech_to_text_websocket_listener import RecognizeListener
Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
# coding: utf-8
2+
3+
# Copyright 2018 IBM All Rights Reserved.
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License");
6+
# you may not use this file except in compliance with the License.
7+
# You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.
16+
17+
class RecognizeCallback:
    """
    Base class for speech-to-text websocket event handlers.

    Every hook is a no-op, so subclasses only override the events they
    care about.
    """

    def on_transcription(self, transcript):
        """
        Called when an interim result is received
        """
        pass

    def on_connected(self):
        """
        Called when a WebSocket connection was made
        """
        pass

    def on_error(self, error):
        """
        Called when there is an error in the Web Socket connection.
        """
        pass

    def on_inactivity_timeout(self, error=None):
        """
        Called when there is an inactivity timeout.

        `error` carries the service's timeout message. It defaults to None
        for backward compatibility, but subclasses in this package override
        the hook with a required `error` parameter, so the base signature
        must accept one.
        """
        pass

    def on_listening(self):
        """
        Called when the service is listening for audio.
        """
        pass

    def on_transcription_complete(self):
        """
        Called after the service returns the final result for the transcription.
        """
        pass

    def on_hypothesis(self, hypothesis):
        """
        Called when the service returns the final hypothesis
        """
        pass

0 commit comments

Comments
 (0)