Python Speech Recognition for Mac OS X
Hi Folks,
I had been searching for a Python speech recognition package, especially one that works on Mac OS,
and I was not happy with the search results.
Finally I found https://pypi.python.org/pypi/SpeechRecognition/.
(Out of the box, SpeechRecognition works only on Linux distributions and Windows.)
But I got it working on Mac OS X by using flac.
Below are the instructions to get it running, along with the modified code.
Prerequisites:
- Install Xcode in Unix Development mode
- Install the Xcode Command Line Tools (type 'xcode-select --install' in the terminal)
- Install MacPorts or Homebrew (port [or] brew)
- brew install portaudio [or] sudo port install portaudio (for loading the mic drivers)
- sudo pip install pyaudio [or] sudo easy_install pyaudio (Python bindings for the microphone)
- sudo pip install SpeechRecognition [or] sudo easy_install SpeechRecognition (a quick import check for these two packages is sketched right after this list)
- sudo port install flac
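After these installs, a quick check from Python (just a sketch, nothing package-specific beyond the two modules installed above) confirms that pyaudio and SpeechRecognition are importable before you go any further:

# Rough sanity check: both imports should succeed without errors.
import pyaudio
import speech_recognition

print("PortAudio: " + pyaudio.get_portaudio_version_text())
print("SpeechRecognition: " + speech_recognition.__version__)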
Once the installation is finished, run the following:
- mdfind flac | grep -i 'bin' - copy the location where flac is installed
- sudo ln -s /opt/local/bin/flac /Library/Python/2.7/site-packages/speech_recognition/flac-mac - create a soft link named flac-mac so the library can reach the flac binary (use the path reported by mdfind if it differs from /opt/local/bin/flac); a small Python check for this link is sketched below
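Before touching the library code, you can confirm the soft link points at a working flac binary. This is only a rough check; the paths assume the MacPorts flac and Python 2.7 site-packages locations used in the steps above, so adjust them if yours differ.

import os

# Path of the soft link created above (assumed location; adjust if needed).
flac_mac = "/Library/Python/2.7/site-packages/speech_recognition/flac-mac"

if os.path.exists(flac_mac):
    target = os.path.realpath(flac_mac)           # resolve the link to the real flac binary
    print("flac-mac resolves to " + target)
    print("executable: " + str(os.access(target, os.X_OK)))
else:
    print("flac-mac link not found - re-run the ln -s step above")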
The following code is the modified version of the package's __init__.py, adapted for Mac OS X.
Make a backup of the original file before replacing it.
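For orientation before reading the whole file: the only functional change from the upstream 1.1.2 source is an extra Darwin branch in the converter-selection logic of samples_to_flac(). The standalone sketch below is mine (the function name pick_flac_converter is not part of the library), it omits the platform.machine() checks and raises a plain OSError instead of the ChildProcessError used in the file.

import os, platform

def pick_flac_converter(module_dir, flac_on_path):
    """Simplified sketch of the converter selection done inside samples_to_flac()."""
    if flac_on_path is not None:                      # a system-wide flac was found on PATH
        return flac_on_path
    system = platform.system()
    if system == "Windows":
        return os.path.join(module_dir, "flac-win32.exe")
    if system == "Linux":
        return os.path.join(module_dir, "flac-linux-i386")
    if system == "Darwin":                            # the added Mac OS X branch
        return os.path.join(module_dir, "flac-mac")   # the soft link created earlier
    raise OSError("FLAC conversion utility not available")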
"""Library for performing speech recognition with the Google Speech Recognition API.""" """Library for performing speech recognition with the Google Speech Recognition API.""" __author__ = 'Anthony Zhang (Uberi)' __version__ = '1.1.2' __license__ = 'BSD' import io, os, subprocess, wave import math, audioop, collections import json, platform, time try: # try to use python2 module from urllib2 import Request, urlopen except ImportError: # otherwise, use python3 module from urllib.request import Request, urlopen #wip: filter out clicks and other too short parts class AudioSource(object): def __init__(self): raise NotImplementedError("this is an abstract class") def __enter__(self): raise NotImplementedError("this is an abstract class") def __exit__(self, exc_type, exc_value, traceback): raise NotImplementedError("this is an abstract class") try: import pyaudio class Microphone(AudioSource): def __init__(self, device_index = None): self.device_index = device_index self.format = pyaudio.paInt16 # 16-bit int sampling self.SAMPLE_WIDTH = pyaudio.get_sample_size(self.format) self.RATE = 16000 # sampling rate in Hertz self.CHANNELS = 1 # mono audio self.CHUNK = 1024 # number of frames stored in each buffer self.audio = None self.stream = None def __enter__(self): self.audio = pyaudio.PyAudio() self.stream = self.audio.open( input_device_index = self.device_index, format = self.format, rate = self.RATE, channels = self.CHANNELS, frames_per_buffer = self.CHUNK, input = True, # stream is an input stream ) return self def __exit__(self, exc_type, exc_value, traceback): self.stream.stop_stream() self.stream.close() self.stream = None self.audio.terminate() except ImportError: pass class WavFile(AudioSource): def __init__(self, filename_or_fileobject): if isinstance(filename_or_fileobject, str): self.filename = filename_or_fileobject else: self.filename = None self.wav_file = filename_or_fileobject self.stream = None def __enter__(self): if self.filename: self.wav_file = open(self.filename, "rb") self.wav_reader = wave.open(self.wav_file, "rb") self.SAMPLE_WIDTH = self.wav_reader.getsampwidth() self.RATE = self.wav_reader.getframerate() self.CHANNELS = self.wav_reader.getnchannels() assert self.CHANNELS == 1 # audio must be mono self.CHUNK = 4096 self.stream = WavFile.WavStream(self.wav_reader) return self def __exit__(self, exc_type, exc_value, traceback): if self.filename: self.wav_file.close() self.stream = None class WavStream(object): def __init__(self, wav_reader): self.wav_reader = wav_reader def read(self, size = -1): if size == -1: return self.wav_reader.readframes(self.wav_reader.getnframes()) return self.wav_reader.readframes(size) class AudioData(object): def __init__(self, rate, data): self.rate = rate self.data = data class Recognizer(AudioSource): def __init__(self, language = "en-US", key = "AIzaSyBOti4mM-6x9WDnZIjIeyEU21OpBXqWBgw"): self.key = key self.language = language self.energy_threshold = 100 # minimum audio energy to consider for recording self.pause_threshold = 0.8 # seconds of quiet time before a phrase is considered complete self.quiet_duration = 0.5 # amount of quiet time to keep on both sides of the recording def samples_to_flac(self, source, frame_data): import platform, os with io.BytesIO() as wav_file: wav_writer = wave.open(wav_file, "wb") try: wav_writer.setsampwidth(source.SAMPLE_WIDTH) wav_writer.setnchannels(source.CHANNELS) wav_writer.setframerate(source.RATE) wav_writer.writeframes(frame_data) finally: # make sure resources are cleaned up wav_writer.close() 
wav_data = wav_file.getvalue() # determine which converter executable to use system = platform.system() path = os.path.dirname(os.path.abspath(__file__)) # directory of the current module file, where all the FLAC bundled binaries are stored flac_converter = shutil_which("flac") # check for installed version first if flac_converter is None: # flac utility is not installed if system == "Windows" and platform.machine() in {"i386", "x86", "x86_64", "AMD64"}: # Windows NT, use the bundled FLAC conversion utility flac_converter = os.path.join(path, "flac-win32.exe") elif system == "Linux" and platform.machine() in {"i386", "x86", "x86_64", "AMD64"}: flac_converter = os.path.join(path, "flac-linux-i386") elif system == 'Darwin': # HERE IS THE CHANGE WE NEED TO SOLVE IT flac_converter = os.path.join(path, "flac-mac") else: raise ChildProcessError("FLAC conversion utility not available - consider installing the FLAC command line application") process = subprocess.Popen("\"%s\" --stdout --totally-silent --best -" % flac_converter, stdin=subprocess.PIPE, stdout=subprocess.PIPE, shell=True) flac_data, stderr = process.communicate(wav_data) return flac_data def record(self, source, duration = None): assert isinstance(source, AudioSource) and source.stream frames = io.BytesIO() seconds_per_buffer = (source.CHUNK + 0.0) / source.RATE elapsed_time = 0 while True: # loop for the total number of chunks needed elapsed_time += seconds_per_buffer if duration and elapsed_time > duration: break buffer = source.stream.read(source.CHUNK) if len(buffer) == 0: break frames.write(buffer) frame_data = frames.getvalue() frames.close() return AudioData(source.RATE, self.samples_to_flac(source, frame_data)) def listen(self, source, timeout = None): assert isinstance(source, AudioSource) and source.stream # record audio data as raw samples frames = collections.deque() assert self.pause_threshold >= self.quiet_duration >= 0 seconds_per_buffer = (source.CHUNK + 0.0) / source.RATE pause_buffer_count = int(math.ceil(self.pause_threshold / seconds_per_buffer)) # number of buffers of quiet audio before the phrase is complete quiet_buffer_count = int(math.ceil(self.quiet_duration / seconds_per_buffer)) # maximum number of buffers of quiet audio to retain before and after elapsed_time = 0 # store audio input until the phrase starts while True: elapsed_time += seconds_per_buffer if timeout and elapsed_time > timeout: # handle timeout if specified raise TimeoutError("listening timed out") buffer = source.stream.read(source.CHUNK) if len(buffer) == 0: break # reached end of the stream frames.append(buffer) # check if the audio input has stopped being quiet energy = audioop.rms(buffer, source.SAMPLE_WIDTH) # energy of the audio signal if energy > self.energy_threshold: break if len(frames) > quiet_buffer_count: # ensure we only keep the needed amount of quiet buffers frames.popleft() # read audio input until the phrase ends pause_count = 0 while True: buffer = source.stream.read(source.CHUNK) if len(buffer) == 0: break # reached end of the stream frames.append(buffer) # check if the audio input has gone quiet for longer than the pause threshold energy = audioop.rms(buffer, source.SAMPLE_WIDTH) # energy of the audio signal if energy > self.energy_threshold: pause_count = 0 else: pause_count += 1 if pause_count > pause_buffer_count: # end of the phrase break # obtain frame data for i in range(quiet_buffer_count, pause_buffer_count): frames.pop() # remove extra quiet frames at the end frame_data = b"".join(list(frames)) return 
AudioData(source.RATE, self.samples_to_flac(source, frame_data)) def recognize(self, audio_data, show_all = False): assert isinstance(audio_data, AudioData) url = "http://www.google.com/speech-api/v2/recognize?client=chromium&lang=%s&key=%s" % (self.language, self.key) self.request = Request(url, data = audio_data.data, headers = {"Content-Type": "audio/x-flac; rate=%s" % audio_data.rate}) # check for invalid key response from the server try: response = urlopen(self.request) except: raise KeyError("Server wouldn't respond (invalid key or quota has been maxed out)") response_text = response.read().decode("utf-8") # ignore any blank blocks actual_result = [] for line in response_text.split("\n"): if not line: continue result = json.loads(line)["result"] if len(result) != 0: actual_result = result[0] # make sure we have a list of transcriptions if "alternative" not in actual_result: raise LookupError("Speech is unintelligible") # return the best guess unless told to do otherwise if not show_all: for prediction in actual_result["alternative"]: if "confidence" in prediction: return prediction["transcript"] raise LookupError("Speech is unintelligible") spoken_text = [] # check to see if Google thinks it's 100% correct default_confidence = 0 if len(actual_result["alternative"])==1: default_confidence = 1 # return all the possibilities for prediction in actual_result["alternative"]: if "confidence" in prediction: spoken_text.append({"text":prediction["transcript"],"confidence":prediction["confidence"]}) else: spoken_text.append({"text":prediction["transcript"],"confidence":default_confidence}) return spoken_text # helper functions def shutil_which(pgm): """ python2 backport of python3's shutil.which() """ path = os.getenv('PATH') for p in path.split(os.path.pathsep): p = os.path.join(p, pgm) if os.path.exists(p) and os.access(p, os.X_OK): return p if __name__ == "__main__": # To identify the Mac OS system = platform.system() if system == 'Darwin': mac = True else: mac = False r = Recognizer() m = Microphone() while True: print("Say something!") if mac: os.system("say Say Something") time.sleep(0.5) with m as source: audio = r.listen(source) print("Got it! Now to recognize it...") if mac: os.system("say -v victoria Got it! Now to recognize it") time.sleep(0.75) try: text = r.recognize(audio) print("You said " + text) if mac: os.system("say -v vicki "+ text) time.sleep(1) if text == 'exit': print "I am gonna EXIT bye bye" if mac: os.system("say I am gonna exit Bye Bye ") time.sleep(0.5) exit() except LookupError: print("Oops! Didn't catch that") if mac: os.system("say -v Alex Oops Didnt catch that") time.sleep(1)
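Once the modified file is saved back as speech_recognition/__init__.py, you can also exercise it without the microphone. Here is a minimal sketch using the WavFile and Recognizer classes defined above; "test.wav" is a placeholder name for any 16-bit mono WAV file you have lying around.

import speech_recognition as sr

r = sr.Recognizer()
with sr.WavFile("test.wav") as source:       # any mono WAV file; "test.wav" is a placeholder
    audio = r.record(source)                 # read the whole file into an AudioData object

try:
    print("Transcript: " + r.recognize(audio))
except LookupError:                          # raised when the speech is unintelligible
    print("Could not understand the audio")
except KeyError:                             # raised when the server does not respond
    print("Could not reach the Google Speech API")

If this prints a transcript, both the flac conversion (through the flac-mac soft link) and the Google API call are working.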
wow. works great. thank you.
Wow, happy it works for you :)
Hi, works great, but it is very slow. How can I increase the speed? I want to use it in an interactive environment. Please help.