[v0.4.0] Plug & Play on multiple platforms!

Lyrcaxis · Lyrcaxis · commit 67d25444b482 · 2025-02-05T22:46:54.000+02:00
diff --git a/.github/workflows/nuget-publish.yml b/.github/workflows/nuget-publish.yml
@@ -1,32 +1,32 @@
 name: Publish to NuGet
 
 on:
-  push:
-    branches:
-        - main
-    paths-ignore:
-        - 'README.md'
+    push:
+        branches:
+            - main
+        paths-ignore:
+            - "README.md"
 
 jobs:
-  publish:
-    runs-on: ubuntu-latest
-    steps:
-    - uses: actions/checkout@v4
-    - uses: actions/setup-dotnet@v4
-      with:
-        dotnet-version: 8.0.x
+    publish:
+        runs-on: ubuntu-latest
+        steps:
+            - uses: actions/checkout@v4
+            - uses: actions/setup-dotnet@v4
+              with:
+                  dotnet-version: 8.0.x
 
-    - name: Download Release Assets
-      run: |
-        mkdir -p release-assets
-        curl -L -o release-assets/voices.zip https://github.com/Lyrcaxis/KokoroSharp/releases/download/v0.1.0/voices.zip
-        curl -L -o release-assets/espeak.zip https://github.com/Lyrcaxis/KokoroSharp/releases/download/v0.1.0/espeak.zip
+            - name: Download Release Assets
+              run: |
+                  mkdir -p release-assets
+                  curl -L -o release-assets/voices.zip https://github.com/Lyrcaxis/KokoroSharpBinaries/releases/download/v1.0.0/voices.zip
+                  curl -L -o release-assets/espeak.zip https://github.com/Lyrcaxis/KokoroSharpBinaries/releases/download/v1.0.0/espeak-ng-binaries-v1.52.zip
 
-    - name: Unzip Release Assets
-      run: |
-        unzip release-assets/voices.zip -d .
-        unzip release-assets/espeak.zip -d .
+            - name: Unzip Release Assets
+              run: |
+                  unzip release-assets/voices.zip -d .
+                  unzip release-assets/espeak.zip -d .
 
-    - run: dotnet build -c Release
-    - run: dotnet pack -c Release
-    - run: dotnet nuget push ./bin/Release/*.nupkg -k ${{ secrets.NUGET_API_KEY }} --source https://api.nuget.org/v3/index.json --skip-duplicate
+            - run: dotnet build -c Release
+            - run: dotnet pack -c Release
+            - run: dotnet nuget push ./bin/Release/*.nupkg -k ${{ secrets.NUGET_API_KEY }} --source https://api.nuget.org/v3/index.json --skip-duplicate
diff --git a/HighLevel/KokoroPlayback.cs b/HighLevel/KokoroPlayback.cs
@@ -1,6 +1,7 @@
 ﻿namespace KokoroSharp;
 
 using KokoroSharp.Core;
+using KokoroSharp.Internal;
 
 using NAudio.Wave;
 using System.Collections.Concurrent;
@@ -9,7 +10,7 @@
 /// <remarks> Internally hosts a background worker thread that keeps checking for any queued samples, and plays them back if there's nothing else playing, in the same order they were queued. </remarks>
 public sealed class KokoroPlayback : IDisposable {
     public static readonly WaveFormat waveFormat = new(24000, 16, 1);
-    readonly WaveOutEvent waveOut = new();
+    readonly KokoroWaveOutEvent waveOut = CrossPlatformHelper.GetAudioPlayer();
     readonly ConcurrentQueue<PlaybackHandle> queuedPackets = [];
 
     volatile bool hasExited;
@@ -63,7 +64,7 @@ internal PlaybackHandle Enqueue(float[] samples, Action OnStarted = null, Action
     public void StopPlayback() => waveOut.Stop();
 
     /// <summary> Adjust the volume of the playback. [0.0, to 1.0] </summary>
-    public void SetVolume(float volume) => waveOut.Volume = Math.Clamp(volume, 0f, 1f);
+    public void SetVolume(float volume) => waveOut.SetVolume(Math.Clamp(volume, 0f, 1f));
 
     /// <summary> Immediately stops the playback and notifies the background worker thread to exit. </summary>
     /// <remarks> Note that this DOES NOT terminate any <see cref="KokoroJob"/>s related to this instance. </remarks>
diff --git a/HighLevel/KokoroVoiceManager.cs b/HighLevel/KokoroVoiceManager.cs
@@ -16,7 +16,7 @@ public static void LoadVoicesFromPath(string voicesPath = "voices") {
         var voiceFilePaths = Directory.GetFiles(voicesPath);
 
         foreach (var filePath in voiceFilePaths) {
-            if (!loadedFilePaths.Add(filePath)) { continue; }
+            if (!loadedFilePaths.Add(filePath) || !filePath.EndsWith(".npy")) { continue; }
             var voiceName = Path.GetFileNameWithoutExtension(filePath);
             var voiceFeatures = np.Load<float[,,]>(filePath);
             Voices.Add(new() { Name = voiceName, Features = voiceFeatures });
diff --git a/Internal/CrossPlatformHelper.cs b/Internal/CrossPlatformHelper.cs
@@ -0,0 +1,36 @@
+﻿namespace KokoroSharp.Internal;
+
+using System.Runtime.InteropServices;
+
+/// <summary> Contains functionality regarding cross-platform compatibility, like providing the path to the appropriate binaries, and setting up the correct audio player. </summary>
+/// <remarks> All platform-specific functionality splits will go through this class. </remarks>
+public static class CrossPlatformHelper {
+
+    /// <summary> Retrieves the path for the appropriate espeak-ng binaries based on the platform and architecture. </summary>
+    /// <remarks> In case there was no matching platform/architecture combo found for the running system, will fall back to "espeak-ng". </remarks>
+    public static string GetEspeakBinariesPath() {
+        // On non-desktop platforms, fall back to a (hopefully pre-installed) version of espeak-ng, as those platforms are not supported out-of-the-box by KokoroSharp.
+        if (!(OperatingSystem.IsWindows() || OperatingSystem.IsLinux() || OperatingSystem.IsMacOS() || OperatingSystem.IsMacCatalyst())) { return "espeak-ng"; }
+
+        // Otherwise, build the path to the binary based on PC's specs.
+        var espeak_cli_path = @$"{Directory.GetCurrentDirectory()}/espeak/espeak-ng-";
+        if (OperatingSystem.IsWindows()) { espeak_cli_path += "win-"; }
+        else if (OperatingSystem.IsLinux()) { espeak_cli_path += "linux-"; }
+        else if (OperatingSystem.IsMacOS()) { espeak_cli_path += "macos-"; }
+        else if (OperatingSystem.IsMacCatalyst()) { espeak_cli_path += "macos-"; }
+        espeak_cli_path += (RuntimeInformation.ProcessArchitecture == Architecture.Arm64 ? "arm64.dll" : "amd64.dll");
+
+        return File.Exists(espeak_cli_path) ? espeak_cli_path : "espeak-ng"; // In case developers did not include the espeak folder at all.
+    }
+
+    /// <summary> Retrieves the appropriate audio player for the running system: <b>NAudio.WaveOutEvent wrapper</b> for Windows, or <b>AL wrapper</b> for other OS. </summary>
+    public static KokoroWaveOutEvent GetAudioPlayer() {
+        if (OperatingSystem.IsWindows()) { return new WindowsAudioPlayer(); }
+        if (OperatingSystem.IsMacOS()) { return new MacOSAudioPlayer(); }
+        if (OperatingSystem.IsMacCatalyst()) { return new MacOSAudioPlayer(); }
+        if (OperatingSystem.IsLinux()) { return new LinuxAudioPlayer(); }
+
+        // Fallback. Might work for Android/iOS too?
+        return new LinuxAudioPlayer(); // Who knows!
+    }
+}
diff --git a/Internal/KokoroWaveOutEvent.cs b/Internal/KokoroWaveOutEvent.cs
@@ -0,0 +1,130 @@
+﻿namespace KokoroSharp.Internal;
+
+using NAudio.Wave;
+
+using OpenTK.Audio.OpenAL;
+
+using System.Diagnostics;
+
+/// <summary> Base class for cross platform playback, with API compatible with NAudio's API. </summary>
+/// <remarks> Each platform (Windows/Linux/MacOS) derives from this to expose a nice interface back to KokoroSharp. </remarks>
+public abstract class KokoroWaveOutEvent {
+    public RawSourceWaveStream stream;
+
+    /// <summary> Playback state. </summary>
+    public abstract PlaybackState PlaybackState { get; }
+
+    /// <summary> Initializes the buffer with an audio stream. </summary>
+    public void Init(RawSourceWaveStream stream) => this.stream = stream;
+
+    /// <summary> Plays back the audio stream that this instance was initialized with. </summary>
+    public abstract void Play();
+
+    /// <summary> Immediately stops the playback. Does not delete the 'stream' though. </summary>
+    public abstract void Stop();
+
+    /// <summary> Adjusts the volume of the playback. Range: [0.0 to 1.0]. </summary>
+    public abstract void SetVolume(float volume);
+
+    /// <summary> Disposes the instance. </summary>
+    public abstract void Dispose();
+
+    /// <summary> Gets how much of the stream has been played, as a fraction of its total length (stream position divided by stream length). </summary>
+    public virtual float CurrentPercentage => stream.Position / (float) stream.Length;
+
+    /// <summary> Pause not supported for simplicity. </summary>
+    public void Pause() => throw new NotImplementedException("We're not gonna support this.");
+}
+
+// A wrapper for NAudio's WaveOutEvent.
+public class WindowsAudioPlayer : KokoroWaveOutEvent {
+    readonly WaveOutEvent waveOut = new();
+    public override PlaybackState PlaybackState => waveOut.PlaybackState;
+    public override void Dispose() => waveOut.Dispose();
+    public override void Play() { waveOut.Init(stream); waveOut.Play(); }
+    public override void SetVolume(float volume) => waveOut.Volume = volume;
+    public override void Stop() => waveOut.Stop();
+}
+
+public class MacOSAudioPlayer : LinuxAudioPlayer { }
+
+// Warning: Terrible, TERRIBLE code..
+public class LinuxAudioPlayer : KokoroWaveOutEvent {
+    public static int BufferSize = 4096 * 64;   // Yes it's long. Could use help to optimize.
+    public static int BufferCount = 256; // 64 MB. Devs can shorten it if needed.
+
+    int source;
+    int[] buffers;
+    Thread streamThread;
+    bool stopRequested;
+    PlaybackState state = PlaybackState.Stopped;
+
+    public override PlaybackState PlaybackState => state;
+
+    // ATM it's joining and creating new thread each time. Not the best idea.
+    public override void Play() {
+        if (streamThread != null) { Stop(); }
+        var device = ALC.OpenDevice(null);
+        var context = ALC.CreateContext(device, (int[]) null);
+        ALC.MakeContextCurrent(context);
+        source = AL.GenSource();
+        buffers = AL.GenBuffers(BufferCount);
+        stopRequested = false;
+
+        // Initialize the buffer
+        for (int i = 0; i < BufferCount; i++) {
+            if (GetBufferFromStream() is not byte[] data) { break; }
+            FillALBuffer(buffers[i], data);
+        }
+        AL.SourceQueueBuffers(source, buffers);
+        AL.SourcePlay(source);
+        state = PlaybackState.Playing;
+
+        streamThread = new Thread(() => {
+            AL.GetSource(source, ALGetSourcei.BuffersProcessed, out int processed);
+
+            var sw = Stopwatch.StartNew();
+            while (processed-- > 0 && !stopRequested) {
+                int buf = AL.SourceUnqueueBuffer(source);
+                if (GetBufferFromStream() is not byte[] data) { break; }
+                FillALBuffer(buf, data);
+                AL.SourceQueueBuffer(source, buf);
+                Thread.Sleep(10);
+            }
+
+            while (!stopRequested && AL.GetSource(source, ALGetSourcei.SourceState) == (int) ALSourceState.Playing) {
+                stream.Position = (int) ((sw.ElapsedMilliseconds / 1000f) * stream.WaveFormat.AverageBytesPerSecond);
+                Thread.Sleep(10);
+            }
+            if (!stopRequested) { stream.Position = stream.Length; }
+            else { stream.Position = (int) ((sw.ElapsedMilliseconds / 1000f) * stream.WaveFormat.AverageBytesPerSecond); }
+
+            state = PlaybackState.Stopped;
+        });
+        streamThread.Start();
+
+        unsafe void FillALBuffer(int buffer, byte[] data) { fixed (byte* ptr = data) { AL.BufferData(buffer, ALFormat.Mono16, (IntPtr) ptr, data.Length, stream.WaveFormat.SampleRate); } }
+        byte[] GetBufferFromStream() {
+            var buffer = new byte[BufferSize];
+            int bytesRead = stream.Read(buffer, 0, BufferSize);
+            if (bytesRead < BufferSize) { Array.Resize(ref buffer, bytesRead); }
+            return bytesRead > 0 ? buffer : null;
+        }
+    }
+
+    public override void Stop() => Dispose();
+    public override void SetVolume(float volume) => AL.Source(source, ALSourcef.Gain, Math.Clamp(volume, 0, 1f)); // Technically supports > 1 volume but not sure if it's a good idea.
+    public override void Dispose() {
+        AL.SourceStop(source);
+        state = PlaybackState.Stopped;
+        stopRequested = true;
+        streamThread?.Join();
+        streamThread = null;
+        AL.DeleteSource(source);
+        AL.DeleteBuffers(buffers);
+        var context = ALC.GetCurrentContext();
+        var device = ALC.GetContextsDevice(context);
+        ALC.DestroyContext(context);
+        ALC.CloseDevice(device);
+    }
+}
diff --git a/KokoroSharp.csproj b/KokoroSharp.csproj
@@ -8,20 +8,22 @@
     </PropertyGroup>
 
     <PropertyGroup>
-        <Version>0.3.11</Version>
+        <Version>0.4.0</Version>
         <PackageId>KokoroSharp</PackageId>
         <Authors>Lyrcaxis</Authors>
         <Description>An inference engine for Kokoro TTS with ONNX runtime, enabling fast and flexible local text-to-speech (fp/quanted) purely via C#. It features segment streaming, voice mixing, linear job scheduling, and optional playback.</Description>
         <RepositoryUrl>https://github.com/Lyrcaxis/KokoroSharp</RepositoryUrl>
-        <PackageTags>Kokoro, TTS, AI, ONNX, SpeechSynthesis, TextToSpeech, Text, To, Speech</PackageTags>
+        <PackageTags>Kokoro, TextToSpeech, TTS, ONNX, AI, SpeechSynthesis, Text, To, Speech, CrossPlatform, .NET, Windows, Linux, MacOS, Offline</PackageTags>
         <PackageReadmeFile>README.md</PackageReadmeFile>
         <PackageLicenseExpression>MIT</PackageLicenseExpression>
+        <AllowUnsafeBlocks>true</AllowUnsafeBlocks>
     </PropertyGroup>
 
     <ItemGroup>
         <PackageReference Include="Microsoft.ML.OnnxRuntime" Version="1.20.1" />
         <PackageReference Include="NAudio" Version="2.2.1" />
         <PackageReference Include="NumSharp" Version="0.30.0" />
+        <PackageReference Include="OpenTK.Audio.OpenAL" Version="5.0.0-pre.13" />
         <PackageReference Include="System.Numerics.Tensors" Version="9.0.1" />
     </ItemGroup>
 
diff --git a/Program.cs b/Program.cs
@@ -80,11 +80,11 @@ static void Main(string[] _) {
             tts.EnqueueJob(new KokoroPauseJob() { PauseTime = 2f, OnComplete = playback.Enqueue });
 
             // And can also manually load the voice from the path you want, as a float array...
-            float[,,] michaelNPY = NumSharp.np.Load<float[,,]>(@"voices\am_michael.npy");
+            float[,,] michaelNPY = NumSharp.np.Load<float[,,]>(@"voices/am_michael.npy");
             tts.EnqueueJob(KokoroJob.Create(ttokens, michaelNPY, speed:0.8f, playback.Enqueue));
 
             // ...or as a KokoroVoice. Those types are fully interchangeable with each other.
-            KokoroVoice onyxVoice = KokoroVoice.FromPath(@"voices\am_onyx.npy");
+            KokoroVoice onyxVoice = KokoroVoice.FromPath(@"voices/am_onyx.npy");
             tts.EnqueueJob(KokoroJob.Create(ttokens, onyxVoice, speed:1.2f, playback.Enqueue));
         }
     }
diff --git a/README.md b/README.md
@@ -20,9 +20,8 @@ With a custom phonemization solution, these additional languages are also suppor
 - `[MandarinChinese, Japanese, Hindi]`.
 
 ## How to setup
-###### You can download the Kokoro v1.0 ONNX models from [taylorchu's repository's releases](https://github.com/taylorchu/kokoro-onnx/releases/tag/v0.2.0).
-- **On Windows:** Install via **Nuget** ([Package Manager](https://learn.microsoft.com/en-us/nuget/quickstart/install-and-use-a-package-in-visual-studio) or [CLI](https://learn.microsoft.com/en-us/nuget/quickstart/install-and-use-a-package-using-the-dotnet-cli)), then [download the ONNX model](https://github.com/taylorchu/kokoro-onnx/releases/tag/v0.2.0), and you're set!
-- **On Linux and MacOS**: In addition to the above, you will need to manually install eSpeak NG for phonemization support. Installation instructions can be found on the [eSpeak NG GitHub repository](https://github.com/espeak-ng/espeak-ng).
+- **First, download the Kokoro v1.0 ONNX models from [taylorchu's repository's releases](https://github.com/taylorchu/kokoro-onnx/releases/tag/v0.2.0).**
+- **On Windows, Linux, and MacOS:** Install via **Nuget** ([Package Manager](https://learn.microsoft.com/en-us/nuget/quickstart/install-and-use-a-package-in-visual-studio) or [CLI](https://learn.microsoft.com/en-us/nuget/quickstart/install-and-use-a-package-using-the-dotnet-cli)), and you're set!
 - **On Other platforms**: For platforms other than the ones above, developers are expected to provide their own phonemization solution. The built-in tokenizer supports raw `(phonemes -> tokens)` conversion.
 
 ###### The package is accessible on all .NET platforms, yet integrated phonemization is only available with the eSpeak NG backend atm.
@@ -43,7 +42,7 @@ Above is a simple way to get started on the highest level. For more control, che
 
 - All communication with the AI model and playback devices happens on background threads, letting the main thread focus on rendering the UI in peace. The library is carefully designed with thread-safety in mind.
 
-- The `voices` folder are automatically copied to your build path when you build and are ready to be accessed. Same with the `espeak` backend for Windows. Developers may opt to remove them when shipping their apps.
+- The `voices` folder is automatically copied to your build path when you build, and is ready to be accessed. Same with the mentioned `espeak` backends. Developers may opt to remove them when shipping their apps.
 
 - Mind that `LoadVoicesFromPath` exists as an option, in case developers want to implement their custom voice-loading logic when shipping a project that utilizes KokoroSharp for text-to-speech synthesis.
 
@@ -52,3 +51,4 @@ Above is a simple way to get started on the highest level. For more control, che
 ## License
 - This project is licensed under the [MIT License](https://github.com/Lyrcaxis/KokoroSharp/blob/main/LICENSE).
 - The [Kokoro 82M model](https://huggingface.co/hexgrad/Kokoro-82M) and its voices are released under the [Apache License](https://huggingface.co/datasets/choosealicense/licenses/blob/main/markdown/apache-2.0.md).
+- eSpeak NG is licensed under the [GPLv3 License](https://github.com/espeak-ng/espeak-ng/blob/master/COPYING).
diff --git a/Tokenization/Tokenizer.cs b/Tokenization/Tokenizer.cs
@@ -1,6 +1,9 @@
 ﻿namespace KokoroSharp.Tokenization;
 
+using KokoroSharp.Internal;
+
 using System.Diagnostics;
+using System.Runtime.InteropServices;
 using System.Text;
 using System.Text.RegularExpressions;
 
@@ -57,10 +60,9 @@ static string Phonemize(string inputText, string langCode, bool preprocess = tru
     /// <summary> Invokes the espeak-ng via command line, to convert given text into phonemes. </summary>
     /// <remarks> Espeak will return a line ending when it meets any of the <see cref="PunctuationTokens"/> and gets rid of any punctuation, so these will have to be converted back to a single-line, with the punctuation restored. </remarks>
     static string Phonemize_Internal(string text, out string originalSegments, string langCode = "en-us") {
-        var espeak_cli_path = OperatingSystem.IsWindows() ? @$"{Directory.GetCurrentDirectory()}\espeak\espeak-ng" : "espeak-ng";
         using var process = new Process() {
             StartInfo = new ProcessStartInfo() {
-                FileName = espeak_cli_path,
+                FileName = CrossPlatformHelper.GetEspeakBinariesPath(),
                 WorkingDirectory = null,
                 Arguments = $"--ipa=3 -q -v {langCode} \"{text}\"",
                 RedirectStandardInput = false,
@@ -70,7 +72,7 @@ static string Phonemize_Internal(string text, out string originalSegments, strin
                 StandardOutputEncoding = Encoding.UTF8
             }
         };
-        process.StartInfo.EnvironmentVariables.Add("ESPEAK_DATA_PATH", @$"{Directory.GetCurrentDirectory()}\espeak\espeak-ng-data");
+        process.StartInfo.EnvironmentVariables.Add("ESPEAK_DATA_PATH", @$"{Directory.GetCurrentDirectory()}/espeak/espeak-ng-data");
         process.Start();
         originalSegments = process.StandardOutput.ReadToEnd();
         process.StandardOutput.Close();

Original file line number	Diff line number	Diff line change
`@@ -80,11 +80,11 @@ static void Main(string[] _) {`
`80`	`80`	`tts.EnqueueJob(new KokoroPauseJob() { PauseTime = 2f, OnComplete = playback.Enqueue });`
`81`	`81`
`82`	`82`	`// And can also manually load the voice from the path you want, as a float array...`
`83`		`- float[,,] michaelNPY = NumSharp.np.Load<float[,,]>(@"voices\am_michael.npy");`
	`83`	`+ float[,,] michaelNPY = NumSharp.np.Load<float[,,]>(@"voices/am_michael.npy");`
`84`	`84`	`tts.EnqueueJob(KokoroJob.Create(ttokens, michaelNPY, speed:0.8f, playback.Enqueue));`
`85`	`85`
`86`	`86`	`// ...or as a KokoroVoice. Those types are fully interchangeable with each other.`
`87`		`- KokoroVoice onyxVoice = KokoroVoice.FromPath(@"voices\am_onyx.npy");`
	`87`	`+ KokoroVoice onyxVoice = KokoroVoice.FromPath(@"voices/am_onyx.npy");`
`88`	`88`	`tts.EnqueueJob(KokoroJob.Create(ttokens, onyxVoice, speed:1.2f, playback.Enqueue));`
`89`	`89`	`}`
`90`	`90`	`}`