1、paddlespeech asr语音转录文字
参考:
https://github.com/PaddlePaddle/PaddleSpeech
安装后运行可能会出现 numpy 相关报错,原因可能是 python 和 numpy 的版本过高;我这里最终使用 python 3.10 + numpy 1.22.0 解决了该问题;
pip install paddlepaddle -i https://mirror.baidu.com/pypi/simple
pip install paddlespeech
1)代码
模型默认下载保存位置:C:\Users\loong\.paddlespeech\models 下
from paddlespeech.cli.asr.infer import ASRExecutor

# Transcribe a single WAV file with the PaddleSpeech ASR executor.
# On the first run the model is downloaded automatically before inference.
executor = ASRExecutor()
transcript = executor(audio_file="zh.wav")
print(transcript)
2)实时语音转录
参考:https://www.cnblogs.com/chenkui164/p/16296941.html
https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/demos/streaming_asr_server/README.md
https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/demos/streaming_asr_server/web
paddlespeech_server stats --task asr ##可以查看支持的模型;更换模型时修改对应的 yaml 配置文件
## 首先运行asr服务器
# 开启流式语音识别服务
cd PaddleSpeech/demos/streaming_asr_server
paddlespeech_server start --config_file conf/ws_conformer_wenetspeech_application_faster.yaml
运行后运行demo里的\demos\streaming_asr_server\web\index.html文件测试:
pyaudio实时录制声音及保存wav
import wave

import pyaudio

# Record a fixed-length clip from the default input device and save it
# as a WAV file.

# Create the PyAudio instance used to open the input stream.
pa = pyaudio.PyAudio()

# Sound-card parameters.
chunk = 1024  # frames per buffer read
Format = pyaudio.paInt16  # sample format
CHANNELS = 2  # number of channels
RATE = 16000  # sampling rate
record_seconds = 5  # recording duration in seconds

# Query the sample width while the PyAudio instance is still alive, so the
# WAV header does not depend on an instance that was already terminated.
sample_width = pa.get_sample_size(Format)

# RATE / chunk buffer reads cover one second; multiplying by the duration
# gives the total number of reads.
num_reads = int(RATE / chunk * record_seconds)

# Raw byte buffers returned by the stream, in capture order.
record_list = []

try:
    # Open the input stream with the parameters above.
    stream = pa.open(
        format=Format,
        rate=RATE,
        channels=CHANNELS,
        frames_per_buffer=chunk,
        input=True,
    )
    try:
        print('开始录制...')
        for _ in range(num_reads):
            # Each read returns the binary sample data of one buffer.
            record_list.append(stream.read(chunk))
    finally:
        # Always stop and close the stream, even if a read fails.
        stream.stop_stream()
        stream.close()
finally:
    # Always release the PyAudio instance.
    pa.terminate()
print('录制结束...')

# Save the captured audio as a WAV file; the context manager closes the
# file even if writing fails.
with wave.open('voice.wav', 'wb') as wav_file:
    wav_file.setnchannels(CHANNELS)
    wav_file.setsampwidth(sample_width)
    wav_file.setframerate(RATE)
    wav_file.writeframes(b''.join(record_list))
2、sherpa 实时语音转录
参考:https://github.com/k2-fsa/sherpa-ncnn
https://www.bilibili.com/video/BV1K44y197Fg
安装:
pip install sherpa-ncnn sounddevice -i https://mirror.baidu.com/pypi/simple
下载:
1)下载项目:git clone https://github.com/k2-fsa/sherpa-ncnn.git
2)下载模型
https://huggingface.co/marcoyang/sherpa-ncnn-streaming-zipformer-zh-14M-2023-02-23
下载这7个文件
运行:
https://k2-fsa.github.io/sherpa/ncnn/python/index.html#start-recording
#!/usr/bin/env python3
# Real-time speech recognition from a microphone with sherpa-ncnn Python API
#
# Please refer to
# https://k2-fsa.github.io/sherpa/ncnn/pretrained_models/index.html
# to download pre-trained models
import sys

try:
    import sounddevice as sd
except ImportError:
    # sounddevice is missing: print the install hint and exit with -1.
    print(
        "Please install sounddevice first. You can use",
        "",
        " pip install sounddevice",
        "",
        "to install it",
        sep="\n",
    )
    sys.exit(-1)

import sherpa_ncnn
def create_recognizer(
    model_dir="./sherpa-ncnn-conv-emformer-transducer-2022-12-06",
    num_threads=4,
):
    """Build a sherpa-ncnn recognizer from the model files in *model_dir*.

    Args:
        model_dir: Directory holding ``tokens.txt`` and the encoder, decoder
            and joiner ``*_jit_trace-pnnx.ncnn.param`` / ``.bin`` files. The
            default is the directory the original script hard-coded; pass the
            directory of another downloaded model to use it instead.
            See https://k2-fsa.github.io/sherpa/ncnn/pretrained_models/index.html
            for download links.
        num_threads: Value forwarded to ``sherpa_ncnn.Recognizer`` as
            ``num_threads``.

    Returns:
        The constructed ``sherpa_ncnn.Recognizer``.
    """
    # Paths are joined with "/" so the defaults stay identical to the
    # previously hard-coded strings.
    model_dir = str(model_dir).rstrip("/")

    def model_file(part, ext):
        return f"{model_dir}/{part}_jit_trace-pnnx.ncnn.{ext}"

    recognizer = sherpa_ncnn.Recognizer(
        tokens=f"{model_dir}/tokens.txt",
        encoder_param=model_file("encoder", "param"),
        encoder_bin=model_file("encoder", "bin"),
        decoder_param=model_file("decoder", "param"),
        decoder_bin=model_file("decoder", "bin"),
        joiner_param=model_file("joiner", "param"),
        joiner_bin=model_file("joiner", "bin"),
        num_threads=num_threads,
    )
    return recognizer
def main():
    """Feed microphone audio to the recognizer and echo the running text.

    Runs until interrupted: reads 100 ms of audio at a time, passes it to
    the recognizer, and reprints the recognized text on the same line
    whenever it changes.
    """
    print("Started! Please speak")
    recognizer = create_recognizer()
    rate = recognizer.sample_rate
    frames_per_read = int(0.1 * rate)  # 0.1 second = 100 ms
    shown = ""
    with sd.InputStream(channels=1, dtype="float32", samplerate=rate) as mic:
        while True:
            block, _ = mic.read(frames_per_read)  # a blocking read
            recognizer.accept_waveform(rate, block.reshape(-1))
            text = recognizer.text
            if text == shown:
                # Nothing new recognized; skip the redraw.
                continue
            shown = text
            print("\r{}".format(text), end="", flush=True)
if __name__ == "__main__":
    # List the available audio devices, then report the one selected by the
    # first entry of sounddevice's default device setting.
    devices = sd.query_devices()
    print(devices)
    input_idx = sd.default.device[0]
    input_name = devices[input_idx]["name"]
    print(f'Use default device: {input_name}')
    try:
        main()
    except KeyboardInterrupt:
        # Ctrl + C is the expected way to stop the recognition loop.
        print("\nCaught Ctrl + C. Exiting")
**修改结果打印效果,去除重复打印结果,结果每次只打印新增的,避免上面每次都打印一遍之前已经识别的内容
# Replacement for the result-printing step inside the recognition loop:
# print only the text added since the previous result instead of reprinting
# the whole transcript. No separate first-iteration counter is needed,
# because with an empty last_result the slice below is the full result.
# NOTE(review): assumes each new result extends the previous one; if the
# recognizer can revise earlier text, the slice will not reflect that.
if last_result != result:
    new_word = result[len(last_result):]
    print("{}".format(new_word), end='', flush=True)
    last_result = result