Skip to content

Commit 67d2544

Browse files
committed
[v0.4.0] Plug & Play on multiple platforms!
1 parent 27ac110 commit 67d2544

9 files changed

+209
-38
lines changed

.github/workflows/nuget-publish.yml

+24-24
Original file line numberDiff line numberDiff line change
@@ -1,32 +1,32 @@
11
name: Publish to NuGet
22

33
on:
4-
push:
5-
branches:
6-
- main
7-
paths-ignore:
8-
- 'README.md'
4+
push:
5+
branches:
6+
- main
7+
paths-ignore:
8+
- "README.md"
99

1010
jobs:
11-
publish:
12-
runs-on: ubuntu-latest
13-
steps:
14-
- uses: actions/checkout@v4
15-
- uses: actions/setup-dotnet@v4
16-
with:
17-
dotnet-version: 8.0.x
11+
publish:
12+
runs-on: ubuntu-latest
13+
steps:
14+
- uses: actions/checkout@v4
15+
- uses: actions/setup-dotnet@v4
16+
with:
17+
dotnet-version: 8.0.x
1818

19-
- name: Download Release Assets
20-
run: |
21-
mkdir -p release-assets
22-
curl -L -o release-assets/voices.zip https://github.com/Lyrcaxis/KokoroSharp/releases/download/v0.1.0/voices.zip
23-
curl -L -o release-assets/espeak.zip https://github.com/Lyrcaxis/KokoroSharp/releases/download/v0.1.0/espeak.zip
19+
- name: Download Release Assets
20+
run: |
21+
mkdir -p release-assets
22+
curl -L -o release-assets/voices.zip https://github.com/Lyrcaxis/KokoroSharpBinaries/releases/download/v1.0.0/voices.zip
23+
curl -L -o release-assets/espeak.zip https://github.com/Lyrcaxis/KokoroSharpBinaries/releases/download/v1.0.0/espeak-ng-binaries-v1.52.zip
2424
25-
- name: Unzip Release Assets
26-
run: |
27-
unzip release-assets/voices.zip -d .
28-
unzip release-assets/espeak.zip -d .
25+
- name: Unzip Release Assets
26+
run: |
27+
unzip release-assets/voices.zip -d .
28+
unzip release-assets/espeak.zip -d .
2929
30-
- run: dotnet build -c Release
31-
- run: dotnet pack -c Release
32-
- run: dotnet nuget push ./bin/Release/*.nupkg -k ${{ secrets.NUGET_API_KEY }} --source https://api.nuget.org/v3/index.json --skip-duplicate
30+
- run: dotnet build -c Release
31+
- run: dotnet pack -c Release
32+
- run: dotnet nuget push ./bin/Release/*.nupkg -k ${{ secrets.NUGET_API_KEY }} --source https://api.nuget.org/v3/index.json --skip-duplicate

HighLevel/KokoroPlayback.cs

+3-2
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
namespace KokoroSharp;
22

33
using KokoroSharp.Core;
4+
using KokoroSharp.Internal;
45

56
using NAudio.Wave;
67
using System.Collections.Concurrent;
@@ -9,7 +10,7 @@
910
/// <remarks> Internally hosts a background worker thread that keeps checking for any queued samples, and plays them back if there's nothing else playing, in the same order they were queued. </remarks>
1011
public sealed class KokoroPlayback : IDisposable {
1112
public static readonly WaveFormat waveFormat = new(24000, 16, 1);
12-
readonly WaveOutEvent waveOut = new();
13+
readonly KokoroWaveOutEvent waveOut = CrossPlatformHelper.GetAudioPlayer();
1314
readonly ConcurrentQueue<PlaybackHandle> queuedPackets = [];
1415

1516
volatile bool hasExited;
@@ -63,7 +64,7 @@ internal PlaybackHandle Enqueue(float[] samples, Action OnStarted = null, Action
6364
public void StopPlayback() => waveOut.Stop();
6465

6566
/// <summary> Adjust the volume of the playback. [0.0, to 1.0] </summary>
66-
public void SetVolume(float volume) => waveOut.Volume = Math.Clamp(volume, 0f, 1f);
67+
public void SetVolume(float volume) => waveOut.SetVolume(Math.Clamp(volume, 0f, 1f));
6768

6869
/// <summary> Immediately stops the playback and notifies the background worker thread to exit. </summary>
6970
/// <remarks> Note that this DOES NOT terminate any <see cref="KokoroJob"/>s related to this instance. </remarks>

HighLevel/KokoroVoiceManager.cs

+1-1
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ public static void LoadVoicesFromPath(string voicesPath = "voices") {
1616
var voiceFilePaths = Directory.GetFiles(voicesPath);
1717

1818
foreach (var filePath in voiceFilePaths) {
19-
if (!loadedFilePaths.Add(filePath)) { continue; }
19+
if (!loadedFilePaths.Add(filePath) || !filePath.EndsWith(".npy")) { continue; }
2020
var voiceName = Path.GetFileNameWithoutExtension(filePath);
2121
var voiceFeatures = np.Load<float[,,]>(filePath);
2222
Voices.Add(new() { Name = voiceName, Features = voiceFeatures });

Internal/CrossPlatformHelper.cs

+36
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
namespace KokoroSharp.Internal;
2+
3+
using System.Runtime.InteropServices;
4+
5+
/// <summary> Central place for platform-dependent decisions: which espeak-ng binary to launch, and which audio player to instantiate. </summary>
/// <remarks> All platform-specific functionality splits will go through this class. </remarks>
public static class CrossPlatformHelper {

    /// <summary> Resolves the espeak-ng executable to launch for the current OS and process architecture. </summary>
    /// <returns>
    /// The full path of the bundled binary under <c>{current directory}/espeak/</c> when the OS is Windows, Linux, macOS or Mac Catalyst AND that file exists;
    /// otherwise the bare command name <c>"espeak-ng"</c>, which relies on a system-wide installation.
    /// </returns>
    public static string GetEspeakBinariesPath() {
        // Pick the OS part of the file name; anything that isn't a supported desktop OS uses the system-wide command.
        string osTag;
        if (OperatingSystem.IsWindows()) { osTag = "win"; }
        else if (OperatingSystem.IsLinux()) { osTag = "linux"; }
        else if (OperatingSystem.IsMacOS() || OperatingSystem.IsMacCatalyst()) { osTag = "macos"; }
        else { return "espeak-ng"; }

        // Every architecture other than Arm64 maps to the amd64 binary.
        var archTag = RuntimeInformation.ProcessArchitecture == Architecture.Arm64 ? "arm64" : "amd64";
        var bundledBinary = $"{Directory.GetCurrentDirectory()}/espeak/espeak-ng-{osTag}-{archTag}.dll";

        // Developers may have stripped the espeak folder from their build; fall back to the system-wide command then.
        if (File.Exists(bundledBinary)) { return bundledBinary; }
        return "espeak-ng";
    }

    /// <summary> Creates the audio player matching the running OS. </summary>
    /// <returns> <see cref="WindowsAudioPlayer"/> on Windows, <see cref="MacOSAudioPlayer"/> on macOS / Mac Catalyst, and <see cref="LinuxAudioPlayer"/> on Linux and on every other platform. </returns>
    public static KokoroWaveOutEvent GetAudioPlayer() {
        if (OperatingSystem.IsWindows()) { return new WindowsAudioPlayer(); }
        if (OperatingSystem.IsMacOS() || OperatingSystem.IsMacCatalyst()) { return new MacOSAudioPlayer(); }

        // Linux, plus the untested fallback for any remaining platform (Android/iOS might work with it too).
        return new LinuxAudioPlayer();
    }
}

Internal/KokoroWaveOutEvent.cs

+130
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,130 @@
1+
namespace KokoroSharp.Internal;
2+
3+
using NAudio.Wave;
4+
5+
using OpenTK.Audio.OpenAL;
6+
7+
using System.Diagnostics;
8+
9+
/// <summary> Base class for cross-platform playback, with an API compatible with NAudio's API. </summary>
/// <remarks>
/// Each platform (Windows/Linux/MacOS) derives from this to expose a nice interface back to KokoroSharp.
/// Implements <see cref="IDisposable"/> so instances can be used with <c>using</c>; the existing <see cref="Dispose"/> member satisfies the interface.
/// </remarks>
public abstract class KokoroWaveOutEvent : IDisposable {
    /// <summary> The audio stream assigned through <see cref="Init"/>; <c>null</c> until <see cref="Init"/> has been called. </summary>
    public RawSourceWaveStream stream;

    /// <summary> Playback state. </summary>
    public abstract PlaybackState PlaybackState { get; }

    /// <summary> Stores the audio stream that the next <see cref="Play"/> call will play back. </summary>
    /// <param name="stream"> The stream to play. </param>
    public void Init(RawSourceWaveStream stream) => this.stream = stream;

    /// <summary> Plays back the audio stream that was passed to <see cref="Init"/>. </summary>
    public abstract void Play();

    /// <summary> Immediately stops the playback. Does not delete the 'stream' though. </summary>
    public abstract void Stop();

    /// <summary> Adjust the volume of the playback. [0.0, to 1.0] </summary>
    /// <param name="volume"> The requested volume. </param>
    public abstract void SetVolume(float volume);

    /// <summary> Disposes the instance. </summary>
    public abstract void Dispose();

    /// <summary> Gets the fraction of the stream that was played, as <c>Position / Length</c>. </summary>
    /// <remarks> Returns 0 when no stream was initialized yet or when the stream is empty, instead of throwing or producing NaN. </remarks>
    public virtual float CurrentPercentage => stream is { Length: > 0 } current ? current.Position / (float) current.Length : 0f;

    /// <summary> Pause not supported for simplicity. </summary>
    /// <exception cref="NotImplementedException"> Always thrown. </exception>
    public void Pause() => throw new NotImplementedException("We're not gonna support this.");
}
38+
39+
/// <summary> Windows playback backend: forwards every call to an NAudio <see cref="WaveOutEvent"/>. </summary>
public class WindowsAudioPlayer : KokoroWaveOutEvent {
    readonly WaveOutEvent output = new();

    /// <summary> Playback state as reported by the wrapped NAudio player. </summary>
    public override PlaybackState PlaybackState {
        get { return output.PlaybackState; }
    }

    /// <summary> Hands the current 'stream' to the wrapped player, then starts playing it. </summary>
    public override void Play() {
        output.Init(stream);
        output.Play();
    }

    /// <summary> Stops the wrapped player. </summary>
    public override void Stop() {
        output.Stop();
    }

    /// <summary> Assigns the given value to the wrapped player's Volume, as-is. </summary>
    public override void SetVolume(float volume) {
        output.Volume = volume;
    }

    /// <summary> Disposes the wrapped player. </summary>
    public override void Dispose() {
        output.Dispose();
    }
}
48+
49+
/// <summary> Audio player used on macOS. Adds nothing of its own: it inherits the whole implementation from <see cref="LinuxAudioPlayer"/>. </summary>
public class MacOSAudioPlayer : LinuxAudioPlayer { }
50+
51+
/// <summary> OpenAL-based audio player (via OpenTK). </summary>
/// <remarks>
/// Warning from the original author: this implementation is rough and could use help to optimize.
/// Every <see cref="Play"/> opens a new device/context/source, pre-fills up to <see cref="BufferCount"/> buffers from 'stream',
/// and starts a helper thread that advances <c>stream.Position</c> from a stopwatch while the source is playing.
/// NOTE(review): the number of processed buffers is sampled only once, right after playback starts, so audio beyond what
/// the pre-filled buffers hold looks like it is never queued. Fixing it needs a read cursor separate from
/// <c>stream.Position</c>, so it is left as-is here; confirm and address separately.
/// </remarks>
public class LinuxAudioPlayer : KokoroWaveOutEvent {
    /// <summary> Size in bytes of each OpenAL buffer. Yes it's long. Could use help to optimize. </summary>
    public static int BufferSize = 4096 * 64;
    /// <summary> Number of OpenAL buffers generated per playback; 64 MB in total with the default <see cref="BufferSize"/>. Devs can shorten it if needed. </summary>
    public static int BufferCount = 256;

    int source;
    int[] buffers;
    Thread streamThread;
    // Written by the caller's thread and polled by the stream thread, hence volatile.
    volatile bool stopRequested;
    // True between a successful Play() and the matching Dispose(); keeps Stop()/Dispose() safe to call at any time.
    bool resourcesAllocated;
    // Last requested gain; re-applied on every Play() because each playback generates a new source.
    float gain = 1f;
    PlaybackState state = PlaybackState.Stopped;

    /// <summary> Playback state: Playing from <see cref="Play"/> until the stream thread finishes or <see cref="Stop"/>/<see cref="Dispose"/> is called. </summary>
    public override PlaybackState PlaybackState => state;

    /// <summary> Starts playing the stream that was passed to Init, stopping any playback that is still allocated first. </summary>
    /// <remarks> ATM it's joining and creating a new thread each time. Not the best idea. </remarks>
    /// <exception cref="InvalidOperationException"> No stream was initialized. </exception>
    public override void Play() {
        // Fail before opening a device, so nothing is leaked when there is nothing to play.
        if (stream is null) { throw new InvalidOperationException("No stream to play: call Init(stream) before Play()."); }
        if (resourcesAllocated) { Stop(); }

        var device = ALC.OpenDevice(null);
        var context = ALC.CreateContext(device, (int[]) null);
        ALC.MakeContextCurrent(context);
        source = AL.GenSource();
        buffers = AL.GenBuffers(BufferCount);
        resourcesAllocated = true;
        stopRequested = false;
        AL.Source(source, ALSourcef.Gain, gain);

        // Pre-fill the buffers, queueing only the ones that actually received data.
        for (int i = 0; i < BufferCount; i++) {
            if (GetBufferFromStream() is not byte[] data) { break; }
            FillALBuffer(buffers[i], data);
            AL.SourceQueueBuffer(source, buffers[i]);
        }
        AL.SourcePlay(source);
        state = PlaybackState.Playing;

        streamThread = new Thread(() => {
            AL.GetSource(source, ALGetSourcei.BuffersProcessed, out int processed);

            var sw = Stopwatch.StartNew();
            // Recycle the buffers that were already processed when the thread started.
            while (processed-- > 0 && !stopRequested) {
                int buf = AL.SourceUnqueueBuffer(source);
                if (GetBufferFromStream() is not byte[] data) { break; }
                FillALBuffer(buf, data);
                AL.SourceQueueBuffer(source, buf);
                Thread.Sleep(10);
            }

            // While the source plays, report progress by moving the stream position according to the elapsed time.
            while (!stopRequested && AL.GetSource(source, ALGetSourcei.SourceState) == (int) ALSourceState.Playing) {
                stream.Position = (int) ((sw.ElapsedMilliseconds / 1000f) * stream.WaveFormat.AverageBytesPerSecond);
                Thread.Sleep(10);
            }
            // Finished naturally: mark the stream as fully played. Stopped early: keep the elapsed position.
            if (!stopRequested) { stream.Position = stream.Length; }
            else { stream.Position = (int) ((sw.ElapsedMilliseconds / 1000f) * stream.WaveFormat.AverageBytesPerSecond); }

            state = PlaybackState.Stopped;
        });
        streamThread.Start();

        // Uploads 'data' into the given OpenAL buffer as 16-bit mono at the stream's sample rate.
        unsafe void FillALBuffer(int buffer, byte[] data) { fixed (byte* ptr = data) { AL.BufferData(buffer, ALFormat.Mono16, (IntPtr) ptr, data.Length, stream.WaveFormat.SampleRate); } }
        // Reads up to BufferSize bytes from the stream; returns null once nothing more can be read.
        byte[] GetBufferFromStream() {
            var buffer = new byte[BufferSize];
            int bytesRead = stream.Read(buffer, 0, BufferSize);
            if (bytesRead < BufferSize) { Array.Resize(ref buffer, bytesRead); }
            return bytesRead > 0 ? buffer : null;
        }
    }

    /// <summary> Immediately stops the playback by releasing everything the last <see cref="Play"/> allocated. </summary>
    public override void Stop() => Dispose();

    /// <summary> Adjust the volume of the playback. [0.0, to 1.0] </summary>
    /// <remarks> The value is remembered and re-applied on the next <see cref="Play"/>. Technically OpenAL supports > 1 volume but not sure if it's a good idea. </remarks>
    public override void SetVolume(float volume) {
        gain = Math.Clamp(volume, 0f, 1f);
        // There's no source to adjust before the first Play() or after a Stop().
        if (resourcesAllocated) { AL.Source(source, ALSourcef.Gain, gain); }
    }

    /// <summary> Stops the playback, joins the stream thread, and releases the source, buffers, context and device. </summary>
    /// <remarks> Safe to call before any <see cref="Play"/> and safe to call repeatedly; it does nothing when no playback is allocated. </remarks>
    public override void Dispose() {
        if (!resourcesAllocated) {
            state = PlaybackState.Stopped;
            return;
        }
        resourcesAllocated = false;

        AL.SourceStop(source);
        state = PlaybackState.Stopped;
        stopRequested = true;
        streamThread?.Join();
        streamThread = null;
        AL.DeleteSource(source);
        AL.DeleteBuffers(buffers);
        buffers = null;
        // NOTE(review): this tears down whichever context is current; presumably that's the one created in Play() - confirm if several players can coexist.
        var context = ALC.GetCurrentContext();
        var device = ALC.GetContextsDevice(context);
        ALC.DestroyContext(context);
        ALC.CloseDevice(device);
    }
}

KokoroSharp.csproj

+4-2
Original file line numberDiff line numberDiff line change
@@ -8,20 +8,22 @@
88
</PropertyGroup>
99

1010
<PropertyGroup>
11-
<Version>0.3.11</Version>
11+
<Version>0.4.0</Version>
1212
<PackageId>KokoroSharp</PackageId>
1313
<Authors>Lyrcaxis</Authors>
1414
<Description>An inference engine for Kokoro TTS with ONNX runtime, enabling fast and flexible local text-to-speech (fp/quanted) purely via C#. It features segment streaming, voice mixing, linear job scheduling, and optional playback.</Description>
1515
<RepositoryUrl>https://github.com/Lyrcaxis/KokoroSharp</RepositoryUrl>
16-
<PackageTags>Kokoro, TTS, AI, ONNX, SpeechSynthesis, TextToSpeech, Text, To, Speech</PackageTags>
16+
<PackageTags>Kokoro, TextToSpeech, TTS, ONNX, AI, SpeechSynthesis, Text, To, Speech, CrossPlatform, .NET, Windows, Linux, MacOS, Offline</PackageTags>
1717
<PackageReadmeFile>README.md</PackageReadmeFile>
1818
<PackageLicenseExpression>MIT</PackageLicenseExpression>
19+
<AllowUnsafeBlocks>true</AllowUnsafeBlocks>
1920
</PropertyGroup>
2021

2122
<ItemGroup>
2223
<PackageReference Include="Microsoft.ML.OnnxRuntime" Version="1.20.1" />
2324
<PackageReference Include="NAudio" Version="2.2.1" />
2425
<PackageReference Include="NumSharp" Version="0.30.0" />
26+
<PackageReference Include="OpenTK.Audio.OpenAL" Version="5.0.0-pre.13" />
2527
<PackageReference Include="System.Numerics.Tensors" Version="9.0.1" />
2628
</ItemGroup>
2729

Program.cs

+2-2
Original file line numberDiff line numberDiff line change
@@ -80,11 +80,11 @@ static void Main(string[] _) {
8080
tts.EnqueueJob(new KokoroPauseJob() { PauseTime = 2f, OnComplete = playback.Enqueue });
8181

8282
// And can also manually load the voice from the path you want, as a float array...
83-
float[,,] michaelNPY = NumSharp.np.Load<float[,,]>(@"voices\am_michael.npy");
83+
float[,,] michaelNPY = NumSharp.np.Load<float[,,]>(@"voices/am_michael.npy");
8484
tts.EnqueueJob(KokoroJob.Create(ttokens, michaelNPY, speed:0.8f, playback.Enqueue));
8585

8686
// ...or as a KokoroVoice. Those types are fully interchangeable with each other.
87-
KokoroVoice onyxVoice = KokoroVoice.FromPath(@"voices\am_onyx.npy");
87+
KokoroVoice onyxVoice = KokoroVoice.FromPath(@"voices/am_onyx.npy");
8888
tts.EnqueueJob(KokoroJob.Create(ttokens, onyxVoice, speed:1.2f, playback.Enqueue));
8989
}
9090
}

README.md

+4-4
Original file line numberDiff line numberDiff line change
@@ -20,9 +20,8 @@ With a custom phonemization solution, these additional languages are also suppor
2020
- `[MandarinChinese, Japanese, Hindi]`.
2121

2222
## How to setup
23-
###### You can download the Kokoro v1.0 ONNX models from [taylorchu's repository's releases](https://github.com/taylorchu/kokoro-onnx/releases/tag/v0.2.0).
24-
- **On Windows:** Install via **Nuget** ([Package Manager](https://learn.microsoft.com/en-us/nuget/quickstart/install-and-use-a-package-in-visual-studio) or [CLI](https://learn.microsoft.com/en-us/nuget/quickstart/install-and-use-a-package-using-the-dotnet-cli)), then [download the ONNX model](https://github.com/taylorchu/kokoro-onnx/releases/tag/v0.2.0), and you're set!
25-
- **On Linux and MacOS**: In addition to the above, you will need to manually install eSpeak NG for phonemization support. Installation instructions can be found on the [eSpeak NG GitHub repository](https://github.com/espeak-ng/espeak-ng).
23+
- **First, download the Kokoro v1.0 ONNX models from [taylorchu's repository's releases](https://github.com/taylorchu/kokoro-onnx/releases/tag/v0.2.0).**
24+
- **On Windows, Linux, and MacOS:** Install via **Nuget** ([Package Manager](https://learn.microsoft.com/en-us/nuget/quickstart/install-and-use-a-package-in-visual-studio) or [CLI](https://learn.microsoft.com/en-us/nuget/quickstart/install-and-use-a-package-using-the-dotnet-cli)), and you're set!
2625
- **On Other platforms**: For platforms other than the ones above, developers are expected to provide their own phonemization solution. The built-in tokenizer supports raw `(phonemes -> tokens)` conversion.
2726

2827
###### The package is accessible on all .NET platforms, yet integrated phonemization is only available with the eSpeak NG backend atm.
@@ -43,7 +42,7 @@ Above is a simple way to get started on the highest level. For more control, che
4342

4443
- All communication with the AI model and playback devices happens on background threads, letting the main thread focus on rendering the UI in peace. The library is carefully designed with thread-safety in mind.
4544

46-
- The `voices` folder are automatically copied to your build path when you build and are ready to be accessed. Same with the `espeak` backend for Windows. Developers may opt to remove them when shipping their apps.
45+
- The `voices` folder is automatically copied to your build path when you build, and is ready to be accessed. Same with the mentioned `espeak` backends. Developers may opt to remove them when shipping their apps.
4746

4847
- Mind that `LoadVoicesFromPath` exists as an option, in case developers want to implement their custom voice-loading logic when shipping a project that utilizes KokoroSharp for text-to-speech synthesis.
4948

@@ -52,3 +51,4 @@ Above is a simple way to get started on the highest level. For more control, che
5251
## License
5352
- This project is licensed under the [MIT License](https://github.com/Lyrcaxis/KokoroSharp/blob/main/LICENSE).
5453
- The [Kokoro 82M model](https://huggingface.co/hexgrad/Kokoro-82M) and its voices are released under the [Apache License](https://huggingface.co/datasets/choosealicense/licenses/blob/main/markdown/apache-2.0.md).
54+
- eSpeak NG is licensed under the [GPLv3 License](https://github.com/espeak-ng/espeak-ng/blob/master/COPYING).

Tokenization/Tokenizer.cs

+5-3
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,9 @@
11
namespace KokoroSharp.Tokenization;
22

3+
using KokoroSharp.Internal;
4+
35
using System.Diagnostics;
6+
using System.Runtime.InteropServices;
47
using System.Text;
58
using System.Text.RegularExpressions;
69

@@ -57,10 +60,9 @@ static string Phonemize(string inputText, string langCode, bool preprocess = tru
5760
/// <summary> Invokes the espeak-ng via command line, to convert given text into phonemes. </summary>
5861
/// <remarks> Espeak will return a line ending when it meets any of the <see cref="PunctuationTokens"/> and gets rid of any punctuation, so these will have to be converted back to a single-line, with the punctuation restored. </remarks>
5962
static string Phonemize_Internal(string text, out string originalSegments, string langCode = "en-us") {
60-
var espeak_cli_path = OperatingSystem.IsWindows() ? @$"{Directory.GetCurrentDirectory()}\espeak\espeak-ng" : "espeak-ng";
6163
using var process = new Process() {
6264
StartInfo = new ProcessStartInfo() {
63-
FileName = espeak_cli_path,
65+
FileName = CrossPlatformHelper.GetEspeakBinariesPath(),
6466
WorkingDirectory = null,
6567
Arguments = $"--ipa=3 -q -v {langCode} \"{text}\"",
6668
RedirectStandardInput = false,
@@ -70,7 +72,7 @@ static string Phonemize_Internal(string text, out string originalSegments, strin
7072
StandardOutputEncoding = Encoding.UTF8
7173
}
7274
};
73-
process.StartInfo.EnvironmentVariables.Add("ESPEAK_DATA_PATH", @$"{Directory.GetCurrentDirectory()}\espeak\espeak-ng-data");
75+
process.StartInfo.EnvironmentVariables.Add("ESPEAK_DATA_PATH", @$"{Directory.GetCurrentDirectory()}/espeak/espeak-ng-data");
7476
process.Start();
7577
originalSegments = process.StandardOutput.ReadToEnd();
7678
process.StandardOutput.Close();

0 commit comments

Comments
 (0)