百度语音现在是比较方便的接口,具体说明请看官方文档,本文分两个部分,先是使用python实现录音,然后再使用百度语音api进行识别上传。
首先是实现录音功能,因为百度语言识别有一些录音品质的要求的。so。。下文的代码可以按时间为文件名生成录音文件,产生一个gui界面,点击按钮后开始录音。
百度语音REST API支持整段录音文件的识别,对录音格式有一定的要求,支持语音识别控件:集成提示音、音量反馈动效整套交互的对话框控件,方便开发者快速集成;
原始PCM的录音参数必须符合8k/16k采样率、16bit位深、单声道,支持的压缩格式有:pcm(不压缩)、wav、opus、speex、amr、x-flac。
语音识别接口支持POST 方式 目前API仅支持整段语音识别的模式,即需要上传整段语音进行识别 语音数据上传方式有两种:隐示发送和显示发送 原始语音的录音格式目前只支持评测8k/16k采样率16bit位深的单声道语音 压缩格式支持:pcm(不压缩)、wav、opus、speex、amr、x-flac 系统支持语言种类:中文(zh)、粤语(ct)、英文(en)
Python
#!usr/bin/env python #coding=utf-8 import numpy as np from pyaudio import PyAudio,paInt16 from datetime import datetime import wave from Tkinter import * #define of params NUM_SAMPLES = 2000 framerate = 8000 channels = 1 sampwidth = 2 #record time TIME = 10 def save_wave_file(filename, data): '''save the date to the wav file''' wf = wave.open(filename, 'wb') wf.setnchannels(channels) wf.setsampwidth(sampwidth) wf.setframerate(framerate) wf.writeframes("".join(data)) wf.close() def my_button(root,label_text,button_text,button_func): '''''function of creat label and button''' #label details label = Label(root) label['text'] = label_text label.pack() #label details button = Button(root) button['text'] = button_text button['command'] = button_func button.pack() def record_wave(): #open the input of wave pa = PyAudio() stream = pa.open(format = paInt16, channels = 1, rate = framerate, input = True, frames_per_buffer = NUM_SAMPLES) save_buffer = [] count = 0 while count < TIME*4: #read NUM_SAMPLES sampling data string_audio_data = stream.read(NUM_SAMPLES) save_buffer.append(string_audio_data) count = 1 print '.' filename = datetime.now().strftime("%Y-%m-%d_%H_%M_%S") ".wav" save_wave_file(filename, save_buffer) save_buffer = [] print filename, "saved" def main(): root = Tk() my_button(root,"Record a wave","clik to record",record_wave) root.mainloop() if __name__ == "__main__": main()
1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465 | #!usr/bin/env python#coding=utf-8 import numpy as npfrom pyaudio import PyAudio,paInt16from datetime import datetimeimport wavefrom Tkinter import * #define of paramsNUM_SAMPLES = 2000framerate = 8000channels = 1sampwidth = 2#record timeTIME = 10 def save_wave_file(filename, data): '''save the date to the wav file''' wf = wave.open(filename, 'wb') wf.setnchannels(channels) wf.setsampwidth(sampwidth) wf.setframerate(framerate) wf.writeframes("".join(data)) wf.close() def my_button(root,label_text,button_text,button_func): '''''function of creat label and button''' #label details label = Label(root) label['text'] = label_text label.pack() #label details button = Button(root) button['text'] = button_text button['command'] = button_func button.pack() def record_wave(): #open the input of wave pa = PyAudio() stream = pa.open(format = paInt16, channels = 1, rate = framerate, input = True, frames_per_buffer = NUM_SAMPLES) save_buffer = [] count = 0 while count < TIME*4: #read NUM_SAMPLES sampling data string_audio_data = stream.read(NUM_SAMPLES) save_buffer.append(string_audio_data) count = 1 print '.' filename = datetime.now().strftime("%Y-%m-%d_%H_%M_%S") ".wav" save_wave_file(filename, save_buffer) save_buffer = [] print filename, "saved" def main(): root = Tk() my_button(root,"Record a wave","clik to record",record_wave) root.mainloop() if __name__ == "__main__": main() |
---|
完成录音后看文件目录是否已经出现一个。wav格式的文件了呢。一次录音大概是十秒钟。然后修改文件名为1.wav
执行下面的程序。有部分需要按照你的id和key进行修改噢。
Python
#encoding=utf-8 import wave import urllib, urllib2, pycurl import base64 import json ## get access token by api key & secret key ## 获得token,需要填写你的apikey以及secretkey def get_token(): apiKey = "Ll0c53MSac6GBOtpg22ZSGAU**" secretKey = "44c8af396038a24e34936227d4a19dc2**" auth_url = "https://openapi.baidu.com/oauth/2.0/token?grant_type=client_credentials&client_id=" apiKey "&client_secret=" secretKey; res = urllib2.urlopen(auth_url) json_data = res.read() return json.loads(json_data)['access_token'] def dump_res(buf): print (buf) ## post audio to server def use_cloud(token): fp = wave.open('1.wav', 'rb')#录音文件名 ##已经录好音的语音片段 nf = fp.getnframes() f_len = nf * 2 audio_data = fp.readframes(nf) cuid = "7519663**" #你的产品id srv_url = 'http://vop.baidu.com/server_api' '?cuid=' cuid '&token=' token http_header = [ 'Content-Type: audio/pcm; rate=8000', 'Content-Length: %d' % f_len ] c = pycurl.Curl() c.setopt(pycurl.URL, str(srv_url)) #curl doesn't support unicode #c.setopt(c.RETURNTRANSFER, 1) c.setopt(c.HTTPHEADER, http_header) #must be list, not dict c.setopt(c.POST, 1) c.setopt(c.CONNECTTIMEOUT, 30) c.setopt(c.TIMEOUT, 30) c.setopt(c.WRITEFUNCTION, dump_res) c.setopt(c.POSTFIELDS, audio_data) c.setopt(c.POSTFIELDSIZE, f_len) c.perform() #pycurl.perform() has no return val if __name__ == "__main__": token = get_token() #获得token use_cloud(token) #进行处理,然后
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354 | #encoding=utf-8 import waveimport urllib, urllib2, pycurlimport base64import json## get access token by api key & secret key## 获得token,需要填写你的apikey以及secretkeydef get_token(): apiKey = "Ll0c53MSac6GBOtpg22ZSGAU**" secretKey = "44c8af396038a24e34936227d4a19dc2**" auth_url = "https://openapi.baidu.com/oauth/2.0/token?grant_type=client_credentials&client_id=" apiKey "&client_secret=" secretKey; res = urllib2.urlopen(auth_url) json_data = res.read() return json.loads(json_data)['access_token'] def dump_res(buf): print (buf) ## post audio to serverdef use_cloud(token): fp = wave.open('1.wav', 'rb')#录音文件名 ##已经录好音的语音片段 nf = fp.getnframes() f_len = nf * 2 audio_data = fp.readframes(nf) cuid = "7519663**" #你的产品id srv_url = 'http://vop.baidu.com/server_api' '?cuid=' cuid '&token=' token http_header = [ 'Content-Type: audio/pcm; rate=8000', 'Content-Length: %d' % f_len ] c = pycurl.Curl() c.setopt(pycurl.URL, str(srv_url)) #curl doesn't support unicode #c.setopt(c.RETURNTRANSFER, 1) c.setopt(c.HTTPHEADER, http_header) #must be list, not dict c.setopt(c.POST, 1) c.setopt(c.CONNECTTIMEOUT, 30) c.setopt(c.TIMEOUT, 30) c.setopt(c.WRITEFUNCTION, dump_res) c.setopt(c.POSTFIELDS, audio_data) c.setopt(c.POSTFIELDSIZE, f_len) c.perform() #pycurl.perform() has no return val if __name__ == "__main__": token = get_token() #获得token use_cloud(token) #进行处理,然后 |
---|
再执行python,等待一小段时间就可以返回看到
就是这样:
主要代码的思路流程很清晰的.
百度语音识别通过 REST API 的方式给开发者提供一个通用的 HTTP 接口,基于该接口,开发者可以轻松的获取语音识别能力。SDK中只提供了PHP、C和JAVA的相关样例,然而个人以为,使用Python开发难度更低,本文描述了简单使用Python调用百度语音识别服务 REST API 的简单样例。
注册开发者帐号和创建应用不再赘述,百度的REST API在调用过程基本分为三步:
- 获取token
- 提交数据
- 处理JSON