Skip to content

Commit 59f4388

Browse files
author
Sean Trott
committed
speech
1 parent dfe2066 commit 59f4388

2 files changed

Lines changed: 210 additions & 0 deletions

File tree

speech.sh

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
#!/bin/bash
# Launch the speech agent.
#
# The asrdir variable must name the directory holding the ASR model files.
# It is quoted so that paths containing spaces survive word splitting, and
# ${asrdir:?} aborts with a clear message instead of passing an empty -asr=.
python3 src/main/nluas/language/speechagent.py -asr="${asrdir:?asrdir must be set to the ASR model directory}"
Lines changed: 208 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,208 @@
1+
#!/usr/bin/env python
2+
######################################################################
3+
#
4+
# File: speechagent.py
5+
#
6+
# Initial Version: Jun 1, 2016 Adam Janin
7+
#
8+
# Record audio and call Kaldi to do speech recognition. Package
9+
# resulting one-best text as an ntuple, and send it to the UI
10+
# agent.
11+
#
12+
# Only tested in python2.7, but it should work in python3.
13+
#
14+
# Currently hardwired to talk to "AgentUI", to name yourself
15+
# "SpeechAgent", and to call Kaldi in a particular way to get text.
16+
#
17+
# The program "rec" (from the sox package) must be on your path.
18+
# The program "online2-wav-nnet2-latgen-faster" (from the kaldi
19+
# package) must be on your path.
20+
#
21+
# Various ASR model files must exist in the directory specified
22+
# by -asr. See github wiki for instructions.
23+
#
24+
# The way Kaldi is called is very hacky. See comments below and
25+
# the wiki for details.
26+
#
27+
# Note: If it seems to hang after recording the audio, it's probably
28+
# a problem with Kaldi. Currently, there's very little error checking
29+
# on Kaldi's output. Search for "readline" in KaldiASR.__next__() for
30+
# where it's probably blocking.
31+
#
32+
33+
from __future__ import print_function
34+
35+
from six.moves import input
36+
import six
37+
38+
import argparse
39+
import os
40+
import re
41+
import shutil
42+
import subprocess
43+
import sys
44+
import tempfile
45+
46+
from nluas.core_agent import CoreAgent
47+
48+
VERSION = 0.1
49+
50+
class SpeechAgent(CoreAgent):
    '''Agent that addresses its messages to the "AgentUI" agent.

    After the CoreAgent initialisation, the instance exposes
    ui_destination ("<federation>_AgentUI") and is subscribed to that
    destination on its transport.
    '''

    def __init__(self, args):
        '''Initialise the core agent, then subscribe to the UI destination.

        args is handed unchanged to CoreAgent.__init__().
        '''
        CoreAgent.__init__(self, args)

        # NOTE(review): self.federation, self.transport and self.callback
        # are not defined here; presumably CoreAgent provides them.
        ui_name = "AgentUI"
        self.ui_destination = "%s_%s" % (self.federation, ui_name)
        self.transport.subscribe(self.ui_destination, self.callback)
    # end __init__()
# end class SpeechAgent
57+
58+
59+
#
60+
# Details on KaldiASR:
61+
#
62+
# Due to details in the Kaldi code, you need to repeatedly feed input
63+
# to the spk2utt specifier, not the wave specifier. Each utterance
64+
# needs a unique ID. Since the mapping from utterance to wave file
65+
# is read in toto, the code can only run for a fixed number of
66+
# utterances. When that number is reached, KaldiASR will restart.
67+
#
68+
# Once Kaldi is started, this program feeds it lines like:
69+
#
70+
#   spkr3 utt3
71+
#
72+
# When it receives a line like the above, it'll perform ASR on the
73+
# wavefile associated with "utt3". In KaldiASR, this will be a
74+
# file called input#.wav in a temporary directory. So to get the whole
75+
# thing to work, record audio to input#.wav, then write "spkr# utt#" to
76+
# stdin of Kaldi, then read stdout of Kaldi looking for "utt#".
77+
#
78+
# For now, we just ignore the lattices and use the log file to extract
79+
# the one-best.
80+
#
81+
# Huge amounts of stuff are hard coded in KaldiASR. Should probably
82+
# switch to arguments of some sort.
83+
#
84+
# This is all pretty hacky, but I want to avoid forking the Kaldi
85+
# decoder if I can.
86+
87+
class KaldiASR(six.Iterator):
    '''Start up a Kaldi recognizer as an iterator.

    Each call to next() prompts the user, records one utterance with
    "rec", feeds it to a long-running Kaldi process and returns the
    one-best transcription (possibly the empty string). Iteration stops
    when the user enters q, when stdin reaches end-of-file, or when the
    Kaldi process stops responding (its pipe closes).

    Call close() (or let the object be garbage collected) to stop Kaldi
    and delete the temporary directory.
    '''

    # Files that must be present in the ASR directory.
    REQUIRED_FILES = ('HCLG.fst', 'final.mdl', 'mfcc.conf', 'online_nnet2_decoding.conf', 'words.txt')

    # Highest utterance id listed in the scp specifier of self.kaldicmd.
    # Kaldi is restarted once self.uttid reaches this value.
    MAX_UTTID = 9999

    # A line holding only whitespace.
    _BLANK_RE = re.compile(r'\s*$')

    # The per-utterance log message Kaldi is expected to emit.
    _DECODED_RE = re.compile(r'LOG \(online2-wav-nnet2-latgen-faster:main\(\):online2-wav-nnet2-latgen-faster\.cc.*Decoded utterance utt')

    def __init__(self, asrdir='/t/janin/ecg/asr', debug=False):
        '''Validate the model directory and start Kaldi.

        asrdir -- directory containing the files in REQUIRED_FILES.
        debug  -- when true, echo every line read from Kaldi to stdout.

        Raises Exception if a required file is missing.
        '''
        # Set these first so that close()/__del__() are safe even when
        # the validation below raises.
        self.tmpdir = None
        self.kaldiproc = None
        self.debug = debug

        # Path to where ASR models and other required files are stored.
        self.asrdir = asrdir

        # Make sure required files exist.
        for f in self.REQUIRED_FILES:
            if not os.path.exists(os.path.join(asrdir, f)):
                raise Exception('speechagent: Unable to locate file "%s" in asr directory "%s".\n' % (f, asrdir))

        # Unique uttid
        self.uttid = 0

        # Where to store temporary files
        self.tmpdir = tempfile.mkdtemp(prefix='speechagent.')

        # Command to perform speech recognition. The scp specifier is a
        # shell loop mapping utt<N> to <tmpdir>/input<N>.wav for every N
        # up to MAX_UTTID. The "\$i" keeps the outer shell from expanding
        # $i, and "\\n" reaches printf as the two characters \n.
        self.kaldicmd = (
            'online2-wav-nnet2-latgen-faster --print-args=false'
            ' --online=false --do-endpointing=false'
            ' --config=%s/online_nnet2_decoding.conf'
            ' --max-active=7000 --beam=15.0 --lattice-beam=6.0'
            ' --acoustic-scale=0.1'
            ' --word-symbol-table=%s/words.txt %s/final.mdl %s/HCLG.fst'
            ' ark:-'
            ' "scp:for i in {0..%d}; do'
            " printf 'utt%%d %s/input%%d.wav\\n' \\$i \\$i; done|\""
            ' ark:/dev/null'
        ) % (self.asrdir, self.asrdir, self.asrdir, self.asrdir, self.MAX_UTTID, self.tmpdir)

        # Command to record audio file. This uses sox, records until
        # it hears a run of silence, and also removes silences at the
        # start and end.
        #
        # Note that %s will be replaced later with the path to
        # the audio file. If you want %'s literally in the cmd,
        # they need to be replaced with %% so that the later python
        # format command doesn't interpret them as arguments.
        self.reccmd = 'rec -V0 -c 1 -b 16 -r 16k -q %s silence 1 0.0 0%% 1 2.0 0.9%%'

        self.start_kaldi()
    # end __init__()

    def start_kaldi(self):
        '''Start or restart the kaldi process and reset the utterance id.'''
        self.uttid = 0
        self._stop_kaldi()
        self.kaldiproc = subprocess.Popen(self.kaldicmd, bufsize=1, shell=True, universal_newlines=True, stdin=subprocess.PIPE, stderr=subprocess.PIPE)
    # end start_kaldi()

    def _stop_kaldi(self):
        '''Terminate the running kaldi process, if any, and reap it.'''
        proc = getattr(self, 'kaldiproc', None)
        if proc is None:
            return
        self.kaldiproc = None

        try:
            proc.terminate()
        except OSError:
            # The process has already exited.
            pass

        # Close our ends of the pipes so the descriptors are not leaked.
        for pipe in (proc.stdin, proc.stderr):
            if pipe is None:
                continue
            try:
                pipe.close()
            except (IOError, OSError):
                pass

        # Reap the child so it does not linger as a zombie.
        proc.wait()
    # end _stop_kaldi()

    def close(self):
        '''Stop kaldi and remove the temporary directory. Safe to call twice.'''
        self._stop_kaldi()
        tmpdir = getattr(self, 'tmpdir', None)
        if tmpdir is not None:
            self.tmpdir = None
            shutil.rmtree(tmpdir, ignore_errors=True)
    # end close()

    def __del__(self):
        '''When iterator goes out of scope, clean temporary directory and stop kaldi process.'''
        self.close()

    def __iter__(self):
        return self

    def __next__(self):
        '''Call "rec" to record a new audio file, then call Kaldi to do ASR on it. Return the one-best transcription.

        Raises StopIteration when the user quits, stdin is exhausted, or
        the pipe to Kaldi closes. Raises subprocess.CalledProcessError if
        "rec" fails.
        '''

        # Restart if uttid is too high. Must match the loop in self.kaldicmd.
        if self.uttid >= self.MAX_UTTID:
            self.start_kaldi()

        print("Press Enter to record (or q to quit)")
        try:
            answer = input()
        except EOFError:
            # No more input can arrive; treat it like a request to quit.
            raise StopIteration()
        if answer == 'q':
            raise StopIteration()

        print("Recording")
        # Record the audio to input#.wav
        audiopath = "%s/input%d.wav" % (self.tmpdir, self.uttid)
        subprocess.check_call(self.reccmd % (audiopath), shell=True)

        print("Running speech recognition")
        # Send string to kaldi that'll cause it to do ASR.
        try:
            self.kaldiproc.stdin.write('spkr%d utt%d\n' % (self.uttid, self.uttid))
            self.kaldiproc.stdin.flush()
        except (IOError, OSError):
            sys.stderr.write('speechagent: Unable to write to Kaldi; stopping.\n')
            raise StopIteration()

        # Kaldi should only output the transcript and a log message.
        # Note that readline() will block waiting for Kaldi.
        # We should probably do timeouts and more error checking
        # here.
        transcript_re = re.compile(r'utt%d (.*)$' % (self.uttid))

        while True:
            line = self.kaldiproc.stderr.readline()

            # readline() returns '' only at end-of-file, i.e. when kaldi
            # has closed its stderr. Without this check the loop would
            # spin forever.
            if line == '':
                sys.stderr.write('speechagent: Kaldi exited unexpectedly; stopping.\n')
                raise StopIteration()

            if self.debug:
                print('speechagent debug:\n', line)

            # Check for a line starting with utt# and return it.
            m = transcript_re.match(line)
            if m is not None:
                os.remove(audiopath)
                self.uttid += 1
                return m.group(1).strip()

            # Blank lines and the expected "Decoded utterance" log line
            # are ignored; anything else is reported.
            if not self._BLANK_RE.match(line) and not self._DECODED_RE.match(line):
                sys.stderr.write('Unexpected line from Kaldi:\n' + line)
    # end __next__()
#end class KaldiASR
188+
189+
def main(argv):
    '''Run the speech agent until the recognizer stops.

    argv -- full argument vector (argv[0] is skipped); must contain
            -asr with the path to the speech recognition model files.

    Each non-empty transcription is printed and sent to the agent's
    ui_destination as {'type': 'speech', 'text': <transcription>}.
    The agent always quits the federation on the way out, including when
    the recognizer cannot be created or raises mid-loop.
    '''
    parser = argparse.ArgumentParser()
    parser.add_argument('-asr', required=True, help='Path to speech recognition model files.')
    args = parser.parse_args(argv[1:])

    speechagent = SpeechAgent(['SpeechAgent'])

    try:
        for asrresult in KaldiASR(args.asr):
            if asrresult:
                print("Heard:", asrresult, "\n")
                ntuple = {'type': 'speech', 'text': asrresult}
                speechagent.transport.send(speechagent.ui_destination, ntuple)
            else:
                print("Speech recognition failed")
    finally:
        # Leave the federation even if recognition failed with an error.
        speechagent.transport.quit_federation()
# end main()


if __name__ == "__main__":
    main(sys.argv)

0 commit comments

Comments
 (0)