[v0.5.0] ULTRA-nice speech & segmentation. Especially with NewLines.

Lyrcaxis · Lyrcaxis · commit 795054464e48 · 2025-02-07T22:52:21.000+02:00
diff --git a/Core/KokoroModel.cs b/Core/KokoroModel.cs
@@ -41,7 +41,7 @@ public float[] Infer(int[] tokens, float[,,] voiceStyle, float speed = 1) {
         Array.Copy(tokens, 0, inputTokens, 1, T); // [0] and [^1] stay as zeroes.
 
         for (int j = 0; j < C; j++) { styleTensor[0, j] = voiceStyle[T - 1, 0, j]; }
-        for (int i = 0; i < inputTokens.Length; i++) { tokenTensor[0, i] = inputTokens[i]; }
+        for (int i = 0; i < inputTokens.Length; i++) { tokenTensor[0, i] = (inputTokens[i] >= 0 ? inputTokens[i] : 4); } // [unk] --> '.'
 
         var inputs = new List<NamedOnnxValue> { GetOnnxValue("tokens", tokenTensor), GetOnnxValue("style", styleTensor), GetOnnxValue("speed", speedTensor) };
         lock (session) {
diff --git a/HighLevel/KokoroTTS.cs b/HighLevel/KokoroTTS.cs
@@ -73,7 +73,7 @@ public SynthesisHandle Speak_Phonemes(string text, int[] tokens, KokoroVoice voi
         currentHandle = new SynthesisHandle() { Job = job, TextToSpeak = text };
         foreach (var step in job.Steps) {
             step.OnStepComplete = (samples) => EnqueueWithCallbacks(samples, text, ttokens, step, job, currentHandle, pipelineConfig, phonemesCache);
-            Debug.WriteLine($"[step {job.Steps.IndexOf(step)}: {new string(step.Tokens.Select(x => Tokenizer.TokenToChar[x]).ToArray())}]");
+            Debug.WriteLine($"[step {job.Steps.IndexOf(step)}: {new string(step.Tokens.Select(x => Tokenizer.TokenToChar[x]).ToArray())}]".Replace("\n", "®"));
         }
         return currentHandle;
     }
diff --git a/KokoroSharp.csproj b/KokoroSharp.csproj
@@ -8,7 +8,7 @@
     </PropertyGroup>
 
     <PropertyGroup>
-        <Version>0.4.9</Version>
+        <Version>0.5.1</Version>
         <PackageId>KokoroSharp</PackageId>
         <Authors>Lyrcaxis</Authors>
         <Description>An inference engine for Kokoro TTS with ONNX runtime, enabling fast and flexible local text-to-speech (fp/quanted) purely via C#. It features segment streaming, voice mixing, linear job scheduling, and optional playback.</Description>
diff --git a/Processing/SegmentationSystem.cs b/Processing/SegmentationSystem.cs
@@ -9,6 +9,7 @@
 /// <summary> Helper class that allows turning text tokens into segments, allowing us to get the first response of the model quicker. </summary>
 /// <remarks> This allows us to begin playing back the audio of the first sentence, while the model processes the rest of the sequence on the background. </remarks>
 public static class SegmentationSystem {
+    static int NLToken = Vocab['\n'];
     static HashSet<int> properEndSeqTokens = [Vocab['.'], Vocab['!'], Vocab['?'], Vocab[':']];
     static HashSet<int> fallbackEndTokens = [Vocab[','], Vocab[' ']];
 
@@ -28,6 +29,10 @@ public static List<int[]> SplitToSegments(int[] tokens, DefaultSegmentationConfi
             var (min, max, _) = GetSegmentRange(segmentsList.Count);
             for (int i = 0; i < max && (totalTokensProcessed + i < tokens.Length); i++) { reusableTempList.Add(tokens[totalTokensProcessed + i]); }
 
+            // If there's a newline token, just end it! Do not look further! It's the perfect place to segment.
+            if (reusableTempList.Contains(NLToken)) { AddRange(reusableTempList.IndexOf(NLToken) + 1); }
+            if (reusableTempList.Count == 0) { continue; }
+
             foreach (var endSeqToken in properEndSeqTokens) { // Check if we can end the sequence properly here.
                 if (reusableTempList.Contains(endSeqToken)) { // They are ordered by highest preference. Periods are nice to end it.
                     AddRange((segmentsList.Count >= 2) ? reusableTempList.LastIndexOf(endSeqToken) : reusableTempList.IndexOf(endSeqToken));
@@ -39,7 +44,7 @@ public static List<int[]> SplitToSegments(int[] tokens, DefaultSegmentationConfi
             // If there was no *proper* end_seq punctuation [.:!?] found on the phrase, we can start searching for fallback punctuation.
             foreach (var fallbackEndToken in fallbackEndTokens) {  // This includes comma and space at the moment, in this order.
                 if (reusableTempList.Contains(fallbackEndToken)) { // So, a split on a 'comma' character will be prefered over a split on 'space'.
-                    AddRange((segmentsList.Count >= 1) ? reusableTempList.LastIndexOf(fallbackEndToken) : reusableTempList.IndexOf(fallbackEndToken));
+                    AddRange(reusableTempList.LastIndexOf(fallbackEndToken));
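-                    break; // For the first segment, we'll take the FIRST occassion for a quick response. For the rest, the last occassion.
+                    break; // The LAST occurrence of the fallback token within the window is always used, regardless of the segment index.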
                 }
             }
@@ -66,21 +71,23 @@ public static List<int[]> SplitToSegments(int[] tokens, DefaultSegmentationConfi
         return segmentsList;
 
         void AddRange(int count) {
+            count = Math.Max(count, 1);
             int end() => totalTokensProcessed + count;
-            while ((end() < tokens.Length) && (properEndSeqTokens.Contains(tokens[end()]) || fallbackEndTokens.Contains(tokens[end()]))) { count++; }
+            var x = tokens[end()];
+            while (end() < tokens.Length && tokens[end()] != NLToken && (properEndSeqTokens.Contains(tokens[end()]) || fallbackEndTokens.Contains(tokens[end()]))) { count++; }
 
             var newEnd = Math.Min(end(), tokens.Length - 1);
             while (newEnd > totalTokensProcessed && tokens[newEnd - 1] == Vocab[' ']) { newEnd--; }
-            if (Math.Abs(newEnd - tokens.Length) < 20) { count += (tokens.Length - newEnd); newEnd = tokens.Length; }
-            if (newEnd > totalTokensProcessed) { segmentsList.Add([.. tokens[totalTokensProcessed..newEnd]]); }
-            Debug.WriteLine($"[{segmentsList.Count}](+{count} [{totalTokensProcessed}/{tokens.Length}]): {new string(tokens[totalTokensProcessed..newEnd].Select(x => TokenToChar[x]).ToArray())}");
+            if (tokens[newEnd] != NLToken && Math.Abs(newEnd - tokens.Length) < 20) { count += (tokens.Length - newEnd); newEnd = tokens.Length; }
+            if (newEnd > totalTokensProcessed + 1) { segmentsList.Add([.. tokens[totalTokensProcessed..newEnd]]); }
+            Debug.WriteLine($"[{segmentsList.Count}](+{count} [{totalTokensProcessed}/{tokens.Length}]): {new string(tokens[totalTokensProcessed..newEnd].Select(x => TokenToChar[x]).ToArray())}".Replace("\n", "®"));
             totalTokensProcessed += count;
             reusableTempList.Clear();
         }
 
         (int min, int max, int _) GetSegmentRange(int segmentIndex) {
             var ss = segmentationStrategy;
-            if (segmentIndex == 0) { return (ss.MinFirstSegmentLength, ss.MaxFirstSegmentLength, (ss.MaxFirstSegmentLength - ss.MinFirstSegmentLength) / 2); }
+            if (segmentIndex == 0) { return (Math.Min(ss.MinFirstSegmentLength, 3), ss.MaxFirstSegmentLength, (ss.MaxFirstSegmentLength - ss.MinFirstSegmentLength) / 2); } // NOTE(review): Math.Min caps the first-segment minimum at 3 even when MinFirstSegmentLength is larger — confirm Math.Max was not intended.
             else if (segmentIndex == 1) { return (0, ss.MaxSecondSegmentLength, ss.MaxSecondSegmentLength); }
             else { return (ss.MinFollowupSegmentsLength, Math.Min(ss.MinFollowupSegmentsLength * 2, KokoroModel.maxTokens), ss.MinFollowupSegmentsLength); }
         }
diff --git a/Processing/Strategies.cs b/Processing/Strategies.cs
@@ -38,22 +38,26 @@ public KokoroTTSPipelineConfig() { }
     public KokoroTTSPipelineConfig(DefaultSegmentationConfig segmentationConfig) : this() => SegmentationFunc = (t) => SegmentationSystem.SplitToSegments(t, segmentationConfig);
 }
 
-/// <summary> Helper class that allows defining amount of seconds will be injected as empty audio between segments that end in a proper punctuation. </summary>
-/// <remarks> This'll allow us to emulate natural pause even on the nicified audio (<see cref="KokoroPlayback.NicifySamples"/>). <b>NOTE:</b> Segments that end on a space or mid-word will <b>NOT</b> get any additional pause. </remarks>
+/// <summary>
+/// <para> Helper class that allows defining the amount of seconds of empty audio that will be injected between segments that end in proper punctuation. </para>
+/// <para> This will allow us to emulate natural pause even on the nicified audio (<see cref="KokoroPlayback.NicifySamples"/>). </para>
+/// <b>NOTE:</b> Only segments that END with one of these characters will receive artificial pauses. Segments that just "speak" one of the ending tokens will not be affected.
+/// </summary>
 public class PauseAfterSegmentStrategy {
     /// <summary> The amount of seconds that should be waited after a segment with specific punctuation on the end was spoken. </summary>
-    public float this[char c] => endingPunctuationPauseSecondsMap[c];
+    public float this[char c] => endingPunctuationPauseSecondsMap.TryGetValue(c, out var p) ? p : endingPunctuationPauseSecondsMap['¿']; // Characters missing from the map fall back to the '¿' entry (OthersPause).
 
     /// <summary> A map containing the amount of seconds that should be waited after a segment with specific punctuation on the end was spoken. </summary>
     IReadOnlyDictionary<char, float> endingPunctuationPauseSecondsMap { get; }
 
-    public PauseAfterSegmentStrategy(float CommaPause = 0.1f, float PeriodPause = 0.5f, float QuestionmarkPause = 0.5f, float ExclamationMarkPause = 0.5f, float OthersPause = 0.5f) {
+    public PauseAfterSegmentStrategy(float CommaPause = 0.1f, float PeriodPause = 0.5f, float QuestionmarkPause = 0.5f, float ExclamationMarkPause = 0.5f, float NewLinePause = 0.5f, float OthersPause = 0.5f) {
         endingPunctuationPauseSecondsMap = new Dictionary<char, float>() {
             { ',', CommaPause },
             { '.', PeriodPause },
             { '?', QuestionmarkPause },
             { '!', ExclamationMarkPause },
-            { ':', OthersPause }
+            { '\n', NewLinePause },
+            { '¿', OthersPause } // Fallback entry: the indexer reads this key for any character that has no entry of its own.
         };
     }
 }
@@ -73,11 +77,11 @@ public PauseAfterSegmentStrategy(float CommaPause = 0.1f, float PeriodPause = 0.
 public class DefaultSegmentationConfig {
     /// <summary> The minimum allowed length of the first segment. Ensures the first segment includes AT LEAST this many tokens. </summary>
     /// <remarks> Recommended to keep this small, to allow instant responses. </remarks>
-    public int MinFirstSegmentLength = 1;
+    public int MinFirstSegmentLength = 10;
 
     /// <summary> The maximum allowed length of the first segment. *NOTE: Having this too small might cut words in the middle* </summary>
     /// <remarks> Recommended to keep this small, but not too small, to allow instant responses. </remarks>
-    public int MaxFirstSegmentLength = 40;
+    public int MaxFirstSegmentLength = 100;
 
     /// <summary> The maximum allowed length of the second segment. *NOTE: Having this too small might cut words in the middle* </summary>
     /// <remarks> Recommended to be a reasonable size based on the first segment's expected length, for seamless audio playback. </remarks>
diff --git a/Processing/Tokenizer.cs b/Processing/Tokenizer.cs
diff --git a/README.md b/README.md

Original file line number	Diff line number	Diff line change
`@@ -73,7 +73,7 @@ public SynthesisHandle Speak_Phonemes(string text, int[] tokens, KokoroVoice voi`
`73`	`73`	`currentHandle = new SynthesisHandle() { Job = job, TextToSpeak = text };`
`74`	`74`	`foreach (var step in job.Steps) {`
`75`	`75`	`step.OnStepComplete = (samples) => EnqueueWithCallbacks(samples, text, ttokens, step, job, currentHandle, pipelineConfig, phonemesCache);`
`76`		`- Debug.WriteLine($"[step {job.Steps.IndexOf(step)}: {new string(step.Tokens.Select(x => Tokenizer.TokenToChar[x]).ToArray())}]");`
	`76`	`+ Debug.WriteLine($"[step {job.Steps.IndexOf(step)}: {new string(step.Tokens.Select(x => Tokenizer.TokenToChar[x]).ToArray())}]".Replace("\n", "®"));`
`77`	`77`	`}`
`78`	`78`	`return currentHandle;`
`79`	`79`	`}`