
Commit

Merge remote-tracking branch 'upstream/main'

flutydeer committed Jan 2, 2023
2 parents 504d27a + 7cec64b commit f03bbb3
Showing 4 changed files with 170 additions and 18 deletions.
2 changes: 1 addition & 1 deletion .gitignore
@@ -132,4 +132,4 @@ dmypy.json
/.idea/

# Tests
-/test.py
+/test*.py
39 changes: 23 additions & 16 deletions README.md
@@ -2,7 +2,14 @@

Python script that slices audio with silence detection

-[Chinese documentation](README.zh-CN.md)
+---
+
+This is the 2.0 version of Audio Slicer, which provides:
+
+- Greatly improved speed (about 400x real-time, compared with 15x in version 1.0)
+- Enhanced slicing logic with fewer errors
+
+The 1.0 version can be found [here](https://github.com/openvpi/audio-slicer/tree/old).

## Screenshots

@@ -12,11 +19,11 @@ Python script that slices audio with silence detection

### Silence detection

-This script uses maximum amplitude to measure and detect silence parts in the audio. A **large sliding window** is used to calculate the max amplitude of each specific area in the original audio by convolution. All areas with a maximum amplitude below the threshold will be regarded as silence.
+This script uses RMS (root mean square) to measure the quietness of the audio and detect silent parts. The RMS value of each frame (frame hop set as **hop size**) is calculated, and all frames with an RMS below the **threshold** will be regarded as silent frames.
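
A minimal sketch of this detection step (illustrative only, not part of the repository; `example.wav` is a placeholder file name):

```python
import librosa

samples, sr = librosa.load('example.wav', sr=None, mono=True)
hop_size = round(sr * 10 / 1000)       # 10 ms per frame, as in the defaults
rms = librosa.feature.rms(y=samples, hop_length=hop_size).squeeze(0)
threshold = 10 ** (-40. / 20.)         # -40 dB expressed as linear amplitude
silent_frames = rms < threshold        # boolean mask of silent frames
```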

### Audio slicing

-Once silence parts are detected, this script uses RMS (root mean square) to determine the specific position where the audio will be sliced. A **small sliding window** is used to search for the best positions to slice the audio, i.e. the position with the lowest RMS value. Long silence parts will be deleted.
+Once the valid (sound) part has reached **min length** since the last slice and a silent part longer than **min interval** is detected, the audio will be sliced apart at the frame(s) with the lowest RMS value within the silent area. Long silence parts may be deleted.
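
For instance, the slice point inside a detected silent span is simply the quietest frame (a toy sketch; the names mirror `slicer2.py`, but the values are made up):

```python
import numpy as np

rms = np.array([0.5, 0.02, 0.005, 0.008, 0.03, 0.6])      # toy frame-wise RMS values
silence_start, i = 1, 4                                    # silent span detected above
pos = rms[silence_start: i + 1].argmin() + silence_start   # -> 2, the quietest frame
```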

## Requirements

@@ -39,16 +46,16 @@ pip install -r requirements.txt
import librosa
import soundfile

-from slicer import Slicer
+from slicer2 import Slicer

audio, sr = librosa.load('example.wav', sr=None)  # Load an audio file with librosa
slicer = Slicer(
    sr=sr,
-    db_threshold=-30,
+    threshold=-40,
    min_length=5000,
-    win_l=400,
-    win_s=20,
-    max_silence_kept=500
+    min_interval=300,
+    hop_size=10,
+    max_sil_kept=500
)
chunks = slicer.slice(audio)
for i, chunk in enumerate(chunks):
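    # (The diff truncates the loop body here; a plausible body, mirroring
    # main() in slicer2.py; 'clips' is an illustrative output directory:)
    soundfile.write(f'clips/example_{i}.wav', chunk, sr)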
@@ -59,8 +66,8 @@ for i, chunk in enumerate(chunks):

The script can be run from the CLI as shown below:

-```shell
-python slicer.py audio [--out OUT] [--db_thresh DB_THRESH] [--min_len MIN_LEN] [--win_l WIN_L] [--win_s WIN_S] [--max_sil_kept MAX_SIL_KEPT]
+```bash
+python slicer2.py audio [--out OUT] [--db_thresh DB_THRESH] [--min_length MIN_LENGTH] [--min_interval MIN_INTERVAL] [--hop_size HOP_SIZE] [--max_sil_kept MAX_SIL_KEPT]
```

where `audio` refers to the audio to be sliced, `--out` defaults to the same directory as the audio, and other options have default values as listed [here](#Parameters).
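
For example, a typical invocation might look like this (`example.wav` and `clips` are placeholder names; the flags match the argparse options in `slicer2.py`):

```bash
python slicer2.py example.wav --out clips --db_thresh -40 --min_length 5000 --min_interval 300 --hop_size 10 --max_sil_kept 500
```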
@@ -83,25 +90,25 @@ Sampling rate of the input audio.

### db_threshold

-The amplitude threshold presented in dB. Areas where all amplitudes are below this threshold will be regarded as silence. Increase this value if your audio is noisy. Defaults to -40.
+The RMS threshold presented in dB. Areas where all RMS values are below this threshold will be regarded as silence. Increase this value if your audio is noisy. Defaults to -40.
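
Internally, the dB value is converted to a linear amplitude before comparison, as `Slicer.__init__` in `slicer2.py` does:

```python
threshold = 10 ** (-40 / 20.)  # -40 dB corresponds to 0.01 in linear amplitude
```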

### min_length

The minimum length required for each sliced audio clip, presented in milliseconds. Defaults to 5000.

-### win_l
+### min_interval

-Size of the large sliding window, presented in milliseconds. Set this value smaller if your audio contains only short breaks. The smaller this value is, the more sliced audio clips this script is likely to generate. Note that this value must be smaller than min_length and larger than win_s. Defaults to 300.
+The minimum length for a silence part to be sliced, presented in milliseconds. Set this value smaller if your audio contains only short breaks. The smaller this value is, the more sliced audio clips this script is likely to generate. Note that this value must be no larger than min_length and no smaller than hop_size. Defaults to 300.

-### win_s
+### hop_size

-Size of the small sliding window, presented in milliseconds. Normally it is not necessary to modify this value. Defaults to 20.
+Length of each RMS frame, presented in milliseconds. Decreasing this value will increase the precision of slicing, but will slow down the process. Defaults to 10.
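
All millisecond parameters are converted to sample and frame counts internally; a sketch of the conversions performed in `Slicer.__init__`, with illustrative values:

```python
sr = 44100                                        # e.g. a 44.1 kHz input
hop_size = round(sr * 10 / 1000)                  # 10 ms -> 441 samples per frame
min_length = round(sr * 5000 / 1000 / hop_size)   # 5000 ms -> 500 frames
```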

### max_silence_kept

The maximum silence length kept around the sliced audio, presented in milliseconds. Adjust this value according to your needs. Note that setting this value does not mean that silence parts in the sliced audio have exactly the given length. The algorithm will search for the best position to slice, as described above. Defaults to 500.

## Performance

-This script contains an $O(n)$ main loop on the Python level, where $n$ refers to the count of audio samples. Besides this bottleneck, all heavy calculation is done by NumPy and SciPy on the C++ level. Thus, this script achieves an RTF (Real-Time Factor) of about 0.02~0.10 on an Intel i7 8750H CPU. In addition, as the `Slicer` class is thread-safe, using multi-threading may further speed up the process.
+This script runs over 400x faster than real-time on an Intel i7 8750H CPU. Speed may vary according to your CPU and your disk. Though `Slicer` is thread-safe, multi-threading does not seem necessary due to the I/O bottleneck.
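
A quick way to check the speed on your own machine (a sketch; `example.wav` is a placeholder):

```python
import time

import librosa
from slicer2 import Slicer

audio, sr = librosa.load('example.wav', sr=None)
slicer = Slicer(sr=sr)  # default parameters
t = time.time()
chunks = slicer.slice(audio)
print('RTF: %.4f' % ((time.time() - t) / (len(audio) / sr)))  # elapsed / audio duration
```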

2 changes: 1 addition & 1 deletion slicer.py
@@ -12,7 +12,7 @@ def timeit(func):
    def run(*args, **kwargs):
        t = time.time()
        res = func(*args, **kwargs)
-        print('executing \'%s\' costed %.3fs' % (func.__name__, time.time() - t))
+        print('executing \'%s\' cost %.3fs' % (func.__name__, time.time() - t))
        return res
    return run

145 changes: 145 additions & 0 deletions slicer2.py
@@ -0,0 +1,145 @@
import os.path
from argparse import ArgumentParser

import librosa
import soundfile


class Slicer:
    def __init__(self,
                 sr: int,
                 threshold: float = -40.,
                 min_length: int = 5000,
                 min_interval: int = 300,
                 hop_size: int = 20,
                 max_sil_kept: int = 5000):
        if not min_length >= min_interval >= hop_size:
            raise ValueError('The following condition must be satisfied: min_length >= min_interval >= hop_size')
        if not max_sil_kept >= hop_size:
            raise ValueError('The following condition must be satisfied: max_sil_kept >= hop_size')
        min_interval = sr * min_interval / 1000
        self.threshold = 10 ** (threshold / 20.)
        self.hop_size = round(sr * hop_size / 1000)
        self.win_size = min(round(min_interval), 4 * self.hop_size)
        self.min_length = round(sr * min_length / 1000 / self.hop_size)
        self.min_interval = round(min_interval / self.hop_size)
        self.max_sil_kept = round(sr * max_sil_kept / 1000 / self.hop_size)

    def _apply_slice(self, waveform, begin, end):
        if len(waveform.shape) > 1:
            return waveform[:, begin * self.hop_size: min(waveform.shape[1], end * self.hop_size)]
        else:
            return waveform[begin * self.hop_size: min(waveform.shape[0], end * self.hop_size)]

    # @timeit
    def slice(self, waveform):
        if len(waveform.shape) > 1:
            samples = librosa.to_mono(waveform)
        else:
            samples = waveform
        if samples.shape[0] <= self.min_length:
            return [waveform]
        rms_list = librosa.feature.rms(y=samples, frame_length=self.win_size, hop_length=self.hop_size).squeeze(0)
        sil_tags = []
        silence_start = None
        clip_start = 0
        for i, rms in enumerate(rms_list):
            # Keep looping while frame is silent.
            if rms < self.threshold:
                # Record start of silent frames.
                if silence_start is None:
                    silence_start = i
                continue
            # Keep looping while frame is not silent and silence start has not been recorded.
            if silence_start is None:
                continue
            # Clear recorded silence start if interval is not enough or clip is too short
            is_leading_silence = silence_start == 0 and i > self.max_sil_kept
            need_slice_middle = i - silence_start >= self.min_interval and i - clip_start >= self.min_length
            if not is_leading_silence and not need_slice_middle:
                silence_start = None
                continue
            # Need slicing. Record the range of silent frames to be removed.
            if i - silence_start <= self.max_sil_kept:
                pos = rms_list[silence_start: i + 1].argmin() + silence_start
                if silence_start == 0:
                    sil_tags.append((0, pos))
                else:
                    sil_tags.append((pos, pos))
                clip_start = pos
            elif i - silence_start <= self.max_sil_kept * 2:
                pos = rms_list[i - self.max_sil_kept: silence_start + self.max_sil_kept + 1].argmin()
                pos += i - self.max_sil_kept
                pos_l = rms_list[silence_start: silence_start + self.max_sil_kept + 1].argmin() + silence_start
                pos_r = rms_list[i - self.max_sil_kept: i + 1].argmin() + i - self.max_sil_kept
                if silence_start == 0:
                    sil_tags.append((0, pos_r))
                    clip_start = pos_r
                else:
                    sil_tags.append((min(pos_l, pos), max(pos_r, pos)))
                    clip_start = max(pos_r, pos)
            else:
                pos_l = rms_list[silence_start: silence_start + self.max_sil_kept + 1].argmin() + silence_start
                pos_r = rms_list[i - self.max_sil_kept: i + 1].argmin() + i - self.max_sil_kept
                if silence_start == 0:
                    sil_tags.append((0, pos_r))
                else:
                    sil_tags.append((pos_l, pos_r))
                clip_start = pos_r
            silence_start = None
        # Deal with trailing silence.
        total_frames = rms_list.shape[0]
        if silence_start is not None and total_frames - silence_start >= self.min_interval:
            silence_end = min(total_frames, silence_start + self.max_sil_kept)
            pos = rms_list[silence_start: silence_end + 1].argmin() + silence_start
            sil_tags.append((pos, total_frames + 1))
        # Apply and return slices.
        if len(sil_tags) == 0:
            return [waveform]
        else:
            chunks = []
            if sil_tags[0][0] > 0:
                chunks.append(self._apply_slice(waveform, 0, sil_tags[0][0]))
            for i in range(len(sil_tags) - 1):
                chunks.append(self._apply_slice(waveform, sil_tags[i][1], sil_tags[i + 1][0]))
            if sil_tags[-1][1] < total_frames:
                chunks.append(self._apply_slice(waveform, sil_tags[-1][1], total_frames))
            return chunks


def main():
    parser = ArgumentParser()
    parser.add_argument('audio', type=str, help='The audio to be sliced')
    parser.add_argument('--out', type=str, help='Output directory of the sliced audio clips')
    parser.add_argument('--db_thresh', type=float, required=False, default=-40,
                        help='The dB threshold for silence detection')
    parser.add_argument('--min_length', type=int, required=False, default=5000,
                        help='The minimum milliseconds required for each sliced audio clip')
    parser.add_argument('--min_interval', type=int, required=False, default=300,
                        help='The minimum milliseconds for a silence part to be sliced')
    parser.add_argument('--hop_size', type=int, required=False, default=10,
                        help='Frame length in milliseconds')
    parser.add_argument('--max_sil_kept', type=int, required=False, default=500,
                        help='The maximum silence length kept around the sliced clip, presented in milliseconds')
    args = parser.parse_args()
    out = args.out
    if out is None:
        out = os.path.dirname(os.path.abspath(args.audio))
    audio, sr = librosa.load(args.audio, sr=None)
    slicer = Slicer(
        sr=sr,
        threshold=args.db_thresh,
        min_length=args.min_length,
        min_interval=args.min_interval,
        hop_size=args.hop_size,
        max_sil_kept=args.max_sil_kept
    )
    chunks = slicer.slice(audio)
    if not os.path.exists(out):
        os.makedirs(out)
    for i, chunk in enumerate(chunks):
        soundfile.write(os.path.join(out, '%s_%d.wav' % (os.path.basename(args.audio).rsplit('.', maxsplit=1)[0], i)), chunk, sr)


if __name__ == '__main__':
    main()
