1. System environment
Operating system: Windows 10 / Windows 11
Runtime: .NET 8
Tools: command line, PowerShell
Open-source library: sherpa-onnx
2. Tool and source code downloads
Open-source library: https://k2-fsa.github.io/sherpa/onnx/index.html
Runtime (SDK) download:
https://dotnet.microsoft.com/zh-cn/download/visual-studio-sdks?cid=getdotnetsdk
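After installing the SDK, you can confirm from PowerShell that .NET 8 is available (standard dotnet CLI commands):
dotnet --version
dotnet --list-sdks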
3. Directory structure
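No listing survived here, so the sketch below is an assumed, illustrative layout that matches the paths used later in this article (adjust the names to your own project):
D:\MyWork\aiwlzc\gigc
├── offline-tts.csproj and the C# sources
├── matcha-icefall-zh-baker\   (model-steps-3.onnx, lexicon.txt, tokens.txt, dict\, phone.fst, date.fst, number.fst)
└── vocos-22khz-univ.onnx      (vocoder)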
4. Build
4.1 Open PowerShell and change into the project directory
cd D:\MyWork\aiwlzc\gigc
4.2 Restore the project and its NuGet dependencies
dotnet restore
# Optional: list the local NuGet cache locations (this by itself does not restore anything)
dotnet nuget locals all --list
4.3 Build and generate the exe
dotnet build
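By default the executable ends up under bin\Debug\net8.0\ inside the project folder; a Release build can be produced the same way (standard dotnet CLI, shown as an optional step):
dotnet build -c Release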
5. Run a test
Note that in PowerShell the line-continuation character is the backtick (`), not the backslash used in the official bash examples, and an exe in the current directory must be invoked with .\ :
.\offline-tts.exe --matcha-acoustic-model=./matcha-icefall-zh-baker/model-steps-3.onnx `
  --matcha-vocoder=./vocos-22khz-univ.onnx `
  --lexicon=./matcha-icefall-zh-baker/lexicon.txt `
  --tokens=./matcha-icefall-zh-baker/tokens.txt `
  --dict-dir=./matcha-icefall-zh-baker/dict `
  --tts-rule-fsts=./matcha-icefall-zh-baker/phone.fst,./matcha-icefall-zh-baker/date.fst,./matcha-icefall-zh-baker/number.fst `
  --debug=1 `
  --output-filename=./未来之窗vits.wav `
  --text='在未来的繁华都市,“未来之窗” 科技公司宛如神明般掌控世界走向。他们率先攻克可控核聚变难题,清洁、无尽的能源如电流般涌向全球,瞬间改写能源格局,让污染工厂成为历史。其研发的量子脑机接口更是神奇,戴上设备,人类能与计算机意识相连。学生眨眼间掌握海量知识,科研人员思维突破禁锢,灵感如泉涌。凭借超算与 AI 算法,精准预测地震、海啸,提前疏散民众,化险为夷。“未来之窗” 用科技锻造坚实护盾,引领人类迈向璀璨新纪元,在科技的王座上熠熠生辉'
6. Model downloads
https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2
https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/vocos-22khz-univ.onnx
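The archives can be fetched and unpacked with the same curl/tar pattern the official commands below use. The matcha-icefall-zh-baker URL is assumed to follow the same tts-models release naming; if it has moved, look it up on the pretrained-models page linked at the end of section 7:
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2
tar xvf matcha-icefall-zh-baker.tar.bz2
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/vocos-22khz-univ.onnx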
7. Official commands
dotnet run \
--matcha-acoustic-model=./matcha-icefall-en_US-ljspeech/model-steps-3.onnx \
--matcha-vocoder=./vocos-22khz-univ.onnx \
--tokens=./matcha-icefall-en_US-ljspeech/tokens.txt \
--data-dir=./matcha-icefall-en_US-ljspeech/espeak-ng-data \
--debug=1 \
--output-filename=./matcha-en.wav \
--text='Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar.'
# vits-aishell3
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-icefall-zh-aishell3.tar.bz2
tar xvf vits-icefall-zh-aishell3.tar.bz2
dotnet run \
--vits-model=./vits-icefall-zh-aishell3/model.onnx \
--tokens=./vits-icefall-zh-aishell3/tokens.txt \
--lexicon=./vits-icefall-zh-aishell3/lexicon.txt \
--tts-rule-fsts=./vits-icefall-zh-aishell3/phone.fst,./vits-icefall-zh-aishell3/date.fst,./vits-icefall-zh-aishell3/number.fst \
--tts-rule-fars=./vits-icefall-zh-aishell3/rule.far \
--sid=66 \
--debug=1 \
--output-filename=./aishell3-66.wav \
--text=这是一个语音合成测试
# Piper models
wget -qq https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2
tar xf vits-piper-en_US-amy-low.tar.bz2
dotnet run \
--vits-model=./vits-piper-en_US-amy-low/en_US-amy-low.onnx \
--tokens=./vits-piper-en_US-amy-low/tokens.txt \
--data-dir=./vits-piper-en_US-amy-low/espeak-ng-data \
--debug=1 \
--output-filename=./amy.wav \
--text='This is a text to speech application in dotnet with Next Generation Kaldi'
Please refer to
https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/index.html
to download more models.
";
var helpText = HelpText.AutoBuild(result, h =>
{
h.AdditionalNewLineAfterOption = false;
h.Heading = usage;
h.Copyright = "Copyright (c) 2024 Xiaomi Corporation";
return HelpText.DefaultParsingErrorsHandler(result, h);
}, e => e);
Console.WriteLine(helpText);
}
To scaffold the console project and solution used by the official examples, and to manage the local NuGet caches:
dotnet new console -n offline-tts-play
dotnet new sln -n sherpa-onnx
dotnet sln ./sherpa-onnx.sln add ./offline-tts-play
dotnet nuget locals all --list
dotnet nuget locals all --clear
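The examples also depend on the sherpa-onnx and PortAudio NuGet packages. The package IDs below are the ones the official C# examples reference; verify the exact names and versions on nuget.org before relying on them:
cd ./offline-tts-play
dotnet add package sherpa-onnx
dotnet add package PortAudioSharp2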
8. Source code
The handler below adapts the official offline-tts-play example to a WinForms button click: it builds an OfflineTtsConfig from a VITS model folder entered in txt_agi_path, synthesizes audio with a generation callback, streams the samples to the default output device through PortAudioSharp, and finally saves the result to a .wav file. It assumes the usual usings (System.Collections.Concurrent for BlockingCollection, System.Runtime.InteropServices for Marshal, PortAudioSharp, and the SherpaOnnx namespace) and a static Options holder sketched at the end of this section.
private void btn_启动_Click(object sender, EventArgs e)
{
// Model and data file paths (txt_agi_path is expected to end with a path separator)
string vitsModelPath = this.txt_agi_path.Text + "model.onnx";// @"./vits-melo-tts-zh_en/model.onnx";
string vitsLexiconPath = this.txt_agi_path.Text + "lexicon.txt";// @"./vits-melo-tts-zh_en/lexicon.txt";
string vitsTokensPath = this.txt_agi_path.Text + "tokens.txt";//= @"./vits-melo-tts-zh_en/tokens.txt";
string vitsDictDir = this.txt_agi_path.Text + "dict";// = @"./vits-melo-tts-zh_en/dict";
string outputFilename = Application.StartupPath + "/agivits/" + DateTime.Now.ToLongTimeString().Replace(':', '_') + ".wav";// @"./zh-en-3.wav";
string text = "它也支持繁体字. 我相信你們一定聽過愛迪生說過的這句話Genius is one percent inspiration and ninety-nine percent perspiration. ";
Options.Model = vitsModelPath;
Options.Lexicon = vitsLexiconPath;
Options.Tokens = vitsTokensPath;
// public static string OutputFilename { get; set; } = "./generated.wav";
Options.OutputFilename = outputFilename;
Options.Text = text; // text to synthesize, consumed by GenerateWithCallback below
// Note: DataDir normally points at an espeak-ng-data folder (Piper models); the original code reuses the dict folder here.
Options.DictDir = vitsDictDir;
Options.DataDir = vitsDictDir;
var config = new OfflineTtsConfig();
config.Model.Vits.Model = Options.Model;
config.Model.Vits.Lexicon = Options.Lexicon;
config.Model.Vits.Tokens = Options.Tokens;
config.Model.Vits.DataDir = Options.DataDir;
config.Model.Vits.DictDir = Options.DictDir;
config.Model.Vits.NoiseScale = Options.NoiseScale;
config.Model.Vits.NoiseScaleW = Options.NoiseScaleW;
config.Model.Vits.LengthScale = Options.LengthScale;
config.Model.Matcha.AcousticModel = Options.AcousticModel;
config.Model.Matcha.Vocoder = Options.Vocoder;
config.Model.NumThreads = 1;
config.Model.Debug = Options.Debug;
config.Model.Provider = "cpu";
config.RuleFsts = Options.RuleFsts;
config.MaxNumSentences = Options.MaxNumSentences;
var tts = new OfflineTts(config);
var speed = 1.0f / Options.LengthScale;
var sid = Options.SpeakerId;
Console.WriteLine(PortAudio.VersionInfo.versionText);
PortAudio.Initialize();
Console.WriteLine($"Number of devices: {PortAudio.DeviceCount}");
for (int i = 0; i != PortAudio.DeviceCount; ++i)
{
Console.WriteLine($" Device {i}");
DeviceInfo deviceInfo = PortAudio.GetDeviceInfo(i);
Console.WriteLine($" Name: {deviceInfo.name}");
Console.WriteLine($" Max output channels: {deviceInfo.maxOutputChannels}");
Console.WriteLine($" Default sample rate: {deviceInfo.defaultSampleRate}");
}
int deviceIndex = PortAudio.DefaultOutputDevice;
if (deviceIndex == PortAudio.NoDevice)
{
Console.WriteLine("No default output device found. Please use ../offline-tts instead");
textBox1.Text = "No default output device found. Please use ../offline-tts instead";
return; // without an output device the code below cannot run (the console sample calls Environment.Exit(1) here)
}
var info = PortAudio.GetDeviceInfo(deviceIndex);
Console.WriteLine();
Console.WriteLine($"Use output default device {deviceIndex} ({info.name})");
textBox1.Text = $"Use output default device {deviceIndex} ({info.name})";
var param = new StreamParameters();
param.device = deviceIndex;
param.channelCount = 1;
param.sampleFormat = SampleFormat.Float32;
param.suggestedLatency = info.defaultLowOutputLatency;
param.hostApiSpecificStreamInfo = IntPtr.Zero;
// https://learn.microsoft.com/en-us/dotnet/standard/collections/thread-safe/blockingcollection-overview
// Buffer shared between the TTS generation callback (producer) and the PortAudio playback callback (consumer).
var dataItems = new BlockingCollection<float[]>();
// Called by sherpa-onnx each time a chunk of samples has been generated.
var MyCallback = (IntPtr samples, int n) =>
{
float[] data = new float[n];
Marshal.Copy(samples, data, 0, n);
dataItems.Add(data);
// 1 means to keep generating
// 0 means to stop generating
return 1;
};
var playFinished = false;
float[] lastSampleArray = null;
int lastIndex = 0; // not played
PortAudioSharp.Stream.Callback playCallback = (IntPtr input, IntPtr output,
UInt32 frameCount,
ref StreamCallbackTimeInfo timeInfo,
StreamCallbackFlags statusFlags,
IntPtr userData
) =>
{
if (dataItems.IsCompleted && lastSampleArray == null && lastIndex == 0)
{
Console.WriteLine($"Finished playing");
// textBox1.Text = $"Failed to write {Options.OutputFilename}";
playFinished = true;
return StreamCallbackResult.Complete;
}
int expected = Convert.ToInt32(frameCount);
int i = 0;
while ((lastSampleArray != null || dataItems.Count != 0) && (i < expected))
{
int needed = expected - i;
if (lastSampleArray != null)
{
int remaining = lastSampleArray.Length - lastIndex;
if (remaining >= needed)
{
float[] this_block = lastSampleArray.Skip(lastIndex).Take(needed).ToArray();
lastIndex += needed;
if (lastIndex == lastSampleArray.Length)
{
lastSampleArray = null;
lastIndex = 0;
}
Marshal.Copy(this_block, 0, IntPtr.Add(output, i * sizeof(float)), needed);
return StreamCallbackResult.Continue;
}
float[] this_block2 = lastSampleArray.Skip(lastIndex).Take(remaining).ToArray();
lastIndex = 0;
lastSampleArray = null;
Marshal.Copy(this_block2, 0, IntPtr.Add(output, i * sizeof(float)), remaining);
i += remaining;
continue;
}
if (dataItems.Count != 0)
{
lastSampleArray = dataItems.Take();
lastIndex = 0;
}
}
if (i < expected)
{
int sizeInBytes = (expected - i) * 4;
Marshal.Copy(new byte[sizeInBytes], 0, IntPtr.Add(output, i * sizeof(float)), sizeInBytes);
}
return StreamCallbackResult.Continue;
};
PortAudioSharp.Stream stream = new PortAudioSharp.Stream(inParams: null, outParams: param, sampleRate: tts.SampleRate,
framesPerBuffer: 0,
streamFlags: StreamFlags.ClipOff,
callback: playCallback,
userData: IntPtr.Zero
);
stream.Start();
var callback = new OfflineTtsCallback(MyCallback);
var audio = tts.GenerateWithCallback(Options.Text, speed, sid, callback);
var ok = audio.SaveToWaveFile(Options.OutputFilename);
if (ok)
{
Console.WriteLine($"Wrote to {Options.OutputFilename} succeeded!");
textBox1.Text = $"succeeded to write {Options.OutputFilename}";
}
else
{
Console.WriteLine($"Failed to write {Options.OutputFilename}");
textBox1.Text = $"Failed to write {Options.OutputFilename}";
}
dataItems.CompleteAdding(); // signal the playback callback that no more samples will arrive
// Wait for playback to drain. Note: this loop (and GenerateWithCallback above) blocks the UI thread;
// in a real WinForms app consider moving the synthesis/playback onto a background thread.
while (!playFinished)
{
Thread.Sleep(100); // 100ms
}
}
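The handler reads its settings from a static Options holder that is not shown above. The sketch below is a minimal, assumed version of it: the property names come from the code above, and the default values mirror the official offline-tts-play example, so treat it as an illustration rather than the author's original class.
// Hypothetical Options holder assumed by the handler above.
internal static class Options
{
    public static string Model { get; set; } = "";
    public static string Lexicon { get; set; } = "";
    public static string Tokens { get; set; } = "";
    public static string DataDir { get; set; } = "";
    public static string DictDir { get; set; } = "";
    public static string AcousticModel { get; set; } = ""; // Matcha acoustic model (unused for VITS)
    public static string Vocoder { get; set; } = "";        // Matcha vocoder (unused for VITS)
    public static string RuleFsts { get; set; } = "";
    public static string Text { get; set; } = "";
    public static string OutputFilename { get; set; } = "./generated.wav";
    public static int SpeakerId { get; set; } = 0;
    public static int Debug { get; set; } = 0;               // 1 prints debug information
    public static int MaxNumSentences { get; set; } = 1;
    public static float NoiseScale { get; set; } = 0.667f;
    public static float NoiseScaleW { get; set; } = 0.8f;
    public static float LengthScale { get; set; } = 1.0f;    // <1 speaks faster, >1 slower
}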