Skip to content

Commit 7950544

Browse files
committed
[v0.5.0] ULTRA-nice speech & segmentation. Especially with NewLines.
1 parent 61be00c commit 7950544

File tree

7 files changed

+110
-43
lines changed

7 files changed

+110
-43
lines changed

Core/KokoroModel.cs

+1-1
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ public float[] Infer(int[] tokens, float[,,] voiceStyle, float speed = 1) {
4141
Array.Copy(tokens, 0, inputTokens, 1, T); // [0] and [^1] stay as zeroes.
4242

4343
for (int j = 0; j < C; j++) { styleTensor[0, j] = voiceStyle[T - 1, 0, j]; }
44-
for (int i = 0; i < inputTokens.Length; i++) { tokenTensor[0, i] = inputTokens[i]; }
44+
for (int i = 0; i < inputTokens.Length; i++) { tokenTensor[0, i] = (inputTokens[i] >= 0 ? inputTokens[i] : 4); } // [unk] --> '.'
4545

4646
var inputs = new List<NamedOnnxValue> { GetOnnxValue("tokens", tokenTensor), GetOnnxValue("style", styleTensor), GetOnnxValue("speed", speedTensor) };
4747
lock (session) {

HighLevel/KokoroTTS.cs

+1-1
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,7 @@ public SynthesisHandle Speak_Phonemes(string text, int[] tokens, KokoroVoice voi
7373
currentHandle = new SynthesisHandle() { Job = job, TextToSpeak = text };
7474
foreach (var step in job.Steps) {
7575
step.OnStepComplete = (samples) => EnqueueWithCallbacks(samples, text, ttokens, step, job, currentHandle, pipelineConfig, phonemesCache);
76-
Debug.WriteLine($"[step {job.Steps.IndexOf(step)}: {new string(step.Tokens.Select(x => Tokenizer.TokenToChar[x]).ToArray())}]");
76+
Debug.WriteLine($"[step {job.Steps.IndexOf(step)}: {new string(step.Tokens.Select(x => Tokenizer.TokenToChar[x]).ToArray())}]".Replace("\n", "®"));
7777
}
7878
return currentHandle;
7979
}

KokoroSharp.csproj

+1-1
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
</PropertyGroup>
99

1010
<PropertyGroup>
11-
<Version>0.4.9</Version>
11+
<Version>0.5.1</Version>
1212
<PackageId>KokoroSharp</PackageId>
1313
<Authors>Lyrcaxis</Authors>
1414
<Description>An inference engine for Kokoro TTS with ONNX runtime, enabling fast and flexible local text-to-speech (fp/quanted) purely via C#. It features segment streaming, voice mixing, linear job scheduling, and optional playback.</Description>

Processing/SegmentationSystem.cs

+13-6
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
/// <summary> Helper class that allows turning text tokens into segments, allowing us to get the first response of the model quicker. </summary>
1010
/// <remarks> This allows us to begin playing back the audio of the first sentence, while the model processes the rest of the sequence in the background. </remarks>
1111
public static class SegmentationSystem {
12+
static int NLToken = Vocab['\n'];
1213
static HashSet<int> properEndSeqTokens = [Vocab['.'], Vocab['!'], Vocab['?'], Vocab[':']];
1314
static HashSet<int> fallbackEndTokens = [Vocab[','], Vocab[' ']];
1415

@@ -28,6 +29,10 @@ public static List<int[]> SplitToSegments(int[] tokens, DefaultSegmentationConfi
2829
var (min, max, _) = GetSegmentRange(segmentsList.Count);
2930
for (int i = 0; i < max && (totalTokensProcessed + i < tokens.Length); i++) { reusableTempList.Add(tokens[totalTokensProcessed + i]); }
3031

32+
// If there's a newline token, just end it! Do not look further! It's the perfect place to segment.
33+
if (reusableTempList.Contains(NLToken)) { AddRange(reusableTempList.IndexOf(NLToken) + 1); }
34+
if (reusableTempList.Count == 0) { continue; }
35+
3136
foreach (var endSeqToken in properEndSeqTokens) { // Check if we can end the sequence properly here.
3237
if (reusableTempList.Contains(endSeqToken)) { // They are ordered by highest preference. Periods are nice to end it.
3338
AddRange((segmentsList.Count >= 2) ? reusableTempList.LastIndexOf(endSeqToken) : reusableTempList.IndexOf(endSeqToken));
@@ -39,7 +44,7 @@ public static List<int[]> SplitToSegments(int[] tokens, DefaultSegmentationConfi
3944
// If there was no *proper* end_seq punctuation [.:!?] found on the phrase, we can start searching for fallback punctuation.
4045
foreach (var fallbackEndToken in fallbackEndTokens) { // This includes comma and space at the moment, in this order.
4146
if (reusableTempList.Contains(fallbackEndToken)) { // So, a split on a 'comma' character will be preferred over a split on 'space'.
42-
AddRange((segmentsList.Count >= 1) ? reusableTempList.LastIndexOf(fallbackEndToken) : reusableTempList.IndexOf(fallbackEndToken));
47+
AddRange(reusableTempList.LastIndexOf(fallbackEndToken));
4348
break; // For the first segment, we'll take the FIRST occasion for a quick response. For the rest, the last occasion.
4449
}
4550
}
@@ -66,21 +71,23 @@ public static List<int[]> SplitToSegments(int[] tokens, DefaultSegmentationConfi
6671
return segmentsList;
6772

6873
void AddRange(int count) {
74+
count = Math.Max(count, 1);
6975
int end() => totalTokensProcessed + count;
70-
while ((end() < tokens.Length) && (properEndSeqTokens.Contains(tokens[end()]) || fallbackEndTokens.Contains(tokens[end()]))) { count++; }
76+
var x = tokens[end()];
77+
while (end() < tokens.Length && tokens[end()] != NLToken && (properEndSeqTokens.Contains(tokens[end()]) || fallbackEndTokens.Contains(tokens[end()]))) { count++; }
7178

7279
var newEnd = Math.Min(end(), tokens.Length - 1);
7380
while (newEnd > totalTokensProcessed && tokens[newEnd - 1] == Vocab[' ']) { newEnd--; }
74-
if (Math.Abs(newEnd - tokens.Length) < 20) { count += (tokens.Length - newEnd); newEnd = tokens.Length; }
75-
if (newEnd > totalTokensProcessed) { segmentsList.Add([.. tokens[totalTokensProcessed..newEnd]]); }
76-
Debug.WriteLine($"[{segmentsList.Count}](+{count} [{totalTokensProcessed}/{tokens.Length}]): {new string(tokens[totalTokensProcessed..newEnd].Select(x => TokenToChar[x]).ToArray())}");
81+
if (tokens[newEnd] != NLToken && Math.Abs(newEnd - tokens.Length) < 20) { count += (tokens.Length - newEnd); newEnd = tokens.Length; }
82+
if (newEnd > totalTokensProcessed + 1) { segmentsList.Add([.. tokens[totalTokensProcessed..newEnd]]); }
83+
Debug.WriteLine($"[{segmentsList.Count}](+{count} [{totalTokensProcessed}/{tokens.Length}]): {new string(tokens[totalTokensProcessed..newEnd].Select(x => TokenToChar[x]).ToArray())}".Replace("\n", "®"));
7784
totalTokensProcessed += count;
7885
reusableTempList.Clear();
7986
}
8087

8188
(int min, int max, int _) GetSegmentRange(int segmentIndex) {
8289
var ss = segmentationStrategy;
83-
if (segmentIndex == 0) { return (ss.MinFirstSegmentLength, ss.MaxFirstSegmentLength, (ss.MaxFirstSegmentLength - ss.MinFirstSegmentLength) / 2); }
90+
if (segmentIndex == 0) { return (Math.Min(ss.MinFirstSegmentLength, 3), ss.MaxFirstSegmentLength, (ss.MaxFirstSegmentLength - ss.MinFirstSegmentLength) / 2); }
8491
else if (segmentIndex == 1) { return (0, ss.MaxSecondSegmentLength, ss.MaxSecondSegmentLength); }
8592
else { return (ss.MinFollowupSegmentsLength, Math.Min(ss.MinFollowupSegmentsLength * 2, KokoroModel.maxTokens), ss.MinFollowupSegmentsLength); }
8693
}

Processing/Strategies.cs

+11-7
Original file line numberDiff line numberDiff line change
@@ -38,22 +38,26 @@ public KokoroTTSPipelineConfig() { }
3838
public KokoroTTSPipelineConfig(DefaultSegmentationConfig segmentationConfig) : this() => SegmentationFunc = (t) => SegmentationSystem.SplitToSegments(t, segmentationConfig);
3939
}
4040

41-
/// <summary> Helper class that allows defining amount of seconds will be injected as empty audio between segments that end in a proper punctuation. </summary>
42-
/// <remarks> This'll allow us to emulate natural pause even on the nicified audio (<see cref="KokoroPlayback.NicifySamples"/>). <b>NOTE:</b> Segments that end on a space or mid-word will <b>NOT</b> get any additional pause. </remarks>
41+
/// <summary>
42+
/// <para> Helper class that defines how many seconds of empty audio will be injected between segments that end in proper punctuation. </para>
43+
/// <para> This will allow us to emulate natural pause even on the nicified audio (<see cref="KokoroPlayback.NicifySamples"/>). </para>
44+
/// <b>NOTE:</b> Only segments that END with one of these characters will receive artificial pauses. Segments that just "speak" one of the ending tokens will not be affected.
45+
/// </summary>
4346
public class PauseAfterSegmentStrategy {
4447
/// <summary> The amount of seconds that should be waited after a segment with specific punctuation on the end was spoken. </summary>
45-
public float this[char c] => endingPunctuationPauseSecondsMap[c];
48+
public float this[char c] => endingPunctuationPauseSecondsMap.TryGetValue(c, out var p) ? p : endingPunctuationPauseSecondsMap['¿'];
4649

4750
/// <summary> A map containing the amount of seconds that should be waited after a segment with specific punctuation on the end was spoken. </summary>
4851
IReadOnlyDictionary<char, float> endingPunctuationPauseSecondsMap { get; }
4952

50-
public PauseAfterSegmentStrategy(float CommaPause = 0.1f, float PeriodPause = 0.5f, float QuestionmarkPause = 0.5f, float ExclamationMarkPause = 0.5f, float OthersPause = 0.5f) {
53+
public PauseAfterSegmentStrategy(float CommaPause = 0.1f, float PeriodPause = 0.5f, float QuestionmarkPause = 0.5f, float ExclamationMarkPause = 0.5f, float NewLinePause = 0.5f, float OthersPause = 0.5f) {
5154
endingPunctuationPauseSecondsMap = new Dictionary<char, float>() {
5255
{ ',', CommaPause },
5356
{ '.', PeriodPause },
5457
{ '?', QuestionmarkPause },
5558
{ '!', ExclamationMarkPause },
56-
{ ':', OthersPause }
59+
{ '\n', NewLinePause },
60+
{ '¿', OthersPause }
5761
};
5862
}
5963
}
@@ -73,11 +77,11 @@ public PauseAfterSegmentStrategy(float CommaPause = 0.1f, float PeriodPause = 0.
7377
public class DefaultSegmentationConfig {
7478
/// <summary> The minimum allowed length of the first segment. Ensures the first segment includes AT LEAST this many tokens. </summary>
7579
/// <remarks> Recommended to keep this small, to allow instant responses. </remarks>
76-
public int MinFirstSegmentLength = 1;
80+
public int MinFirstSegmentLength = 10;
7781

7882
/// <summary> The maximum allowed length of the first segment. *NOTE: Having this too small might cut words in the middle* </summary>
7983
/// <remarks> Recommended to keep this small, but not too small, to allow instant responses. </remarks>
80-
public int MaxFirstSegmentLength = 40;
84+
public int MaxFirstSegmentLength = 100;
8185

8286
/// <summary> The maximum allowed length of the second segment. *NOTE: Having this too small might cut words in the middle* </summary>
8387
/// <remarks> Recommended to be a reasonable size based on the first segment's expected length, for seamless audio playback. </remarks>

0 commit comments

Comments
 (0)