Merge branch 'main' of github.com:facebookresearch/demucs
adefossez committed May 23, 2023
2 parents 7eed73b + 51b6545 commit 5a79870
Showing 15 changed files with 80 additions and 50 deletions.
6 changes: 4 additions & 2 deletions .github/workflows/linter.yml
@@ -4,20 +4,22 @@ on:
branches: [ main ]
pull_request:
branches: [ main ]
workflow_dispatch:

jobs:
build:
runs-on: ubuntu-latest
if: ${{ github.repository == 'facebookresearch/demucs' || github.event_name == 'workflow_dispatch' }}
steps:
- uses: actions/checkout@v2
- uses: actions/setup-python@v2
with:
python-version: 3.7
python-version: 3.8

- uses: actions/cache@v2
with:
path: env
key: env-${{ hashFiles('**/requirements.txt') }}
key: env-${{ hashFiles('**/requirements.txt', '.github/workflows/*') }}

- name: Install dependencies
run: |
7 changes: 5 additions & 2 deletions .github/workflows/tests.yml
@@ -4,23 +4,26 @@ on:
branches: [ main ]
pull_request:
branches: [ main ]
workflow_dispatch:

jobs:
build:
runs-on: ubuntu-latest
if: ${{ github.repository == 'facebookresearch/demucs' || github.event_name == 'workflow_dispatch' }}
steps:
- uses: actions/checkout@v2
- uses: actions/setup-python@v2
with:
python-version: 3.7
python-version: 3.8

- uses: actions/cache@v2
with:
path: env
key: env-${{ hashFiles('**/requirements.txt') }}
key: env-${{ hashFiles('**/requirements.txt', '.github/workflows/*') }}

- name: Install dependencies
run: |
sudo apt-get update
sudo apt-get install -y ffmpeg
python3 -m venv env
. env/bin/activate
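
Both workflows also gain a manual `workflow_dispatch` trigger; combined with the new `if:` condition, the jobs run automatically only on the upstream `facebookresearch/demucs` repository, while forks can still launch them by hand.
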
1 change: 1 addition & 0 deletions Makefile
@@ -17,6 +17,7 @@ test_eval:
python3 -m demucs -n demucs_unittest --two-stems=vocals test.mp3
python3 -m demucs -n demucs_unittest --mp3 test.mp3
python3 -m demucs -n demucs_unittest --int24 --clip-mode clamp test.mp3
python3 -m demucs -n demucs_unittest --segment 8 test.mp3

tests/musdb:
test -e tests || mkdir tests
48 changes: 31 additions & 17 deletions README.md
@@ -79,28 +79,28 @@ of the naturalness and absence of artifacts given by human listeners (5 = no art
is a rating from 1 to 5 with 5 being zero contamination by other sources. We refer the reader to our [paper][hybrid_paper]
for more details.

| Model | Domain | Extra data? | Overall SDR | MOS Quality | MOS Contamination |
|------------------------------|-------------|-------------|-------------|-------------|-------------------|
| [Wave-U-Net][waveunet] | waveform | no | 3.2 | - | - |
| [Open-Unmix][openunmix] | spectrogram | no | 5.3 | - | - |
| [D3Net][d3net] | spectrogram | no | 6.0 | - | - |
| [Conv-Tasnet][demucs_v2] | waveform | no | 5.7 | - | |
| [Demucs (v2)][demucs_v2] | waveform | no | 6.3 | 2.37 | 2.36 |
| [ResUNetDecouple+][decouple] | spectrogram | no | 6.7 | - | - |
| [KUIELAB-MDX-Net][kuielab] | hybrid | no | 7.5 | **2.86** | 2.55 |
| [Band-Spit RNN][bandsplit] | spectrogram | no | **8.2** | - | - |
| **Hybrid Demucs (v3)** | hybrid | no | 7.7 | **2.83** | **3.04** |
| [MMDenseLSTM][mmdenselstm] | spectrogram | 804 songs | 6.0 | - | - |
| [D3Net][d3net] | spectrogram | 1.5k songs | 6.7 | - | - |
| [Spleeter][spleeter] | spectrogram | 25k songs | 5.9 | - | - |
| [Band-Spit RNN][bandsplit] | spectrogram | 1.7k (mixes only) | **9.0** | - | - |
| **HT Demucs f.t. (v4)** | hybrid | 800 songs | **9.0** | - | - |
| Model | Domain | Extra data? | Overall SDR | MOS Quality | MOS Contamination |
|------------------------------|-------------|-------------------|-------------|-------------|-------------------|
| [Wave-U-Net][waveunet] | waveform | no | 3.2 | - | - |
| [Open-Unmix][openunmix] | spectrogram | no | 5.3 | - | - |
| [D3Net][d3net] | spectrogram | no | 6.0 | - | - |
| [Conv-Tasnet][demucs_v2] | waveform | no | 5.7 | - | |
| [Demucs (v2)][demucs_v2] | waveform | no | 6.3 | 2.37 | 2.36 |
| [ResUNetDecouple+][decouple] | spectrogram | no | 6.7 | - | - |
| [KUIELAB-MDX-Net][kuielab] | hybrid | no | 7.5 | **2.86** | 2.55 |
| [Band-Split RNN][bandsplit]  | spectrogram | no                | **8.2**     | -           | -                 |
| **Hybrid Demucs (v3)** | hybrid | no | 7.7 | **2.83** | **3.04** |
| [MMDenseLSTM][mmdenselstm] | spectrogram | 804 songs | 6.0 | - | - |
| [D3Net][d3net] | spectrogram | 1.5k songs | 6.7 | - | - |
| [Spleeter][spleeter] | spectrogram | 25k songs | 5.9 | - | - |
| [Band-Split RNN][bandsplit]  | spectrogram | 1.7k (mixes only) | **9.0**     | -           | -                 |
| **HT Demucs f.t. (v4)** | hybrid | 800 songs | **9.0** | - | - |



## Requirements

You will need at least Python 3.7. See `requirements_minimal.txt` for requirements for separation only,
You will need at least Python 3.8. See `requirements_minimal.txt` for requirements for separation only,
and `environment-[cpu|cuda].yml` (or `requirements.txt`) if you want to train a new model.

### For Windows users
@@ -242,6 +242,20 @@ If you want to use GPU acceleration, you will need at least 3GB of RAM on your G

If you do not have enough memory on your GPU, simply add `-d cpu` to the command line to use the CPU. With Demucs, processing time should be roughly equal to 1.5 times the duration of the track.

## Calling from another Python program

The main function provides an `opts` parameter as a simple API. You can just pass the command-line arguments, split into a list, through this parameter:
```python
# Assume that your command is `demucs --mp3 --two-stems vocals -n mdx_extra "track with space.mp3"`
# The following code is equivalent to the command above:
import demucs.separate
demucs.separate.main(["--mp3", "--two-stems", "vocals", "-n", "mdx_extra", "track with space.mp3"])

# Or like this
import demucs.separate
import shlex
demucs.separate.main(shlex.split('--mp3 --two-stems vocals -n mdx_extra "track with space.mp3"'))
```
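
As with the command-line tool, the separated stems are then written under the `separated/<model name>/<track name>/` folder by default (or wherever `-o` points).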

## Training Demucs

13 changes: 10 additions & 3 deletions demucs/apply.py
@@ -18,9 +18,10 @@

from .demucs import Demucs
from .hdemucs import HDemucs
from .htdemucs import HTDemucs
from .utils import center_trim, DummyPoolExecutor

Model = tp.Union[Demucs, HDemucs]
Model = tp.Union[Demucs, HDemucs, HTDemucs]


class BagOfModels(nn.Module):
@@ -122,7 +123,7 @@ def tensor_chunk(tensor_or_chunk):

def apply_model(model, mix, shifts=1, split=True,
overlap=0.25, transition_power=1., progress=False, device=None,
num_workers=0, pool=None):
num_workers=0, segment=None, pool=None):
"""
Apply model to a given mixture.
@@ -157,6 +158,7 @@ def apply_model(model, mix, shifts=1, split=True,
'progress': progress,
'device': device,
'pool': pool,
'segment': segment,
}
if isinstance(model, BagOfModels):
# Special treatment for bag of model.
@@ -201,7 +203,11 @@ def apply_model(model, mix, shifts=1, split=True,
kwargs['split'] = False
out = th.zeros(batch, len(model.sources), channels, length, device=mix.device)
sum_weight = th.zeros(length, device=mix.device)
segment = int(model.samplerate * model.segment)
if segment is None:
segment = model.segment
segment_old = model.segment
model.segment = segment
segment = int(model.samplerate * segment)
stride = int((1 - overlap) * segment)
offsets = range(0, length, stride)
scale = float(format(stride / model.samplerate, ".2f"))
@@ -227,6 +233,7 @@
chunk_length = chunk_out.shape[-1]
out[..., offset:offset + segment] += (weight[:chunk_length] * chunk_out).to(mix.device)
sum_weight[offset:offset + segment] += weight[:chunk_length].to(mix.device)
model.segment = segment_old
assert sum_weight.min() > 0
out /= sum_weight
return out
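
The new `segment` argument lets a caller shorten the split windows at call time without touching the checkpoint. Below is a minimal sketch of driving `apply_model` directly with it; the model name, the fake mixture and the printed shape are illustrative and not part of this diff:

```python
import torch

from demucs.apply import apply_model
from demucs.pretrained import get_model

model = get_model(name="mdx_extra")  # any pretrained model or bag of models
model.eval()

# Fake 10-second mixture with shape (batch, channels, length).
mix = torch.zeros(1, model.audio_channels, 10 * model.samplerate)

# `segment` is in seconds; apply_model temporarily overrides `model.segment`
# with it while processing the chunks and restores the old value afterwards.
with torch.no_grad():
    sources = apply_model(model, mix, split=True, overlap=0.25, segment=8)

print(sources.shape)  # (batch, len(model.sources), channels, length)
```
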
18 changes: 11 additions & 7 deletions demucs/audio.py
@@ -12,6 +12,7 @@
import numpy as np
import torch
import torchaudio as ta
import typing as tp

from .utils import temp_filenames

@@ -72,8 +73,7 @@ def read(self,
duration=None,
streams=slice(None),
samplerate=None,
channels=None,
temp_folder=None):
channels=None):
"""
Slightly more efficient implementation than stempeg,
in particular, this will extract all stems at once
@@ -94,9 +94,6 @@
See https://sound.stackexchange.com/a/42710.
Our definition of mono is simply the average of the two channels. Any other
value will be ignored.
temp_folder (str or Path or None): temporary folder to use for decoding.
"""
streams = np.array(range(len(self)))[streams]
single = not isinstance(streams, np.ndarray)
@@ -222,6 +219,8 @@ def prevent_clip(wav, mode='rescale'):
"""
different strategies for avoiding raw clipping.
"""
if mode is None or mode == 'none':
return wav
assert wav.dtype.is_floating_point, "too late for clipping"
if mode == 'rescale':
wav = wav / max(1.01 * wav.abs().max(), 1)
@@ -234,8 +233,13 @@ def prevent_clip(wav, mode='rescale'):
return wav


def save_audio(wav, path, samplerate, bitrate=320, clip='rescale',
bits_per_sample=16, as_float=False):
def save_audio(wav: torch.Tensor,
path: tp.Union[str, Path],
samplerate: int,
bitrate: int = 320,
clip: tp.Literal["rescale", "clamp", "tanh", "none"] = 'rescale',
bits_per_sample: tp.Literal[16, 24, 32] = 16,
as_float: bool = False):
"""Save audio file, automatically preventing clipping if necessary
based on the given `clip` strategy. If the path ends in `.mp3`, this
will save as mp3 with the given `bitrate`.
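
`save_audio` now has an explicitly typed signature, and `prevent_clip` accepts `'none'` to leave the waveform untouched. A short sketch of the API; the silent test tensor and output paths are made up for illustration:

```python
from pathlib import Path

import torch
from demucs.audio import save_audio

wav = torch.zeros(2, 44100)  # one second of stereo silence, float32 in [-1, 1]

# 24-bit WAV, clamping samples that would clip instead of rescaling the whole track.
save_audio(wav, Path("out.wav"), samplerate=44100, clip="clamp", bits_per_sample=24)

# MP3 output is picked from the file extension and honours `bitrate` (in kbps).
save_audio(wav, Path("out.mp3"), samplerate=44100, bitrate=320, clip="none")
```
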
2 changes: 1 addition & 1 deletion demucs/demucs.py
@@ -285,7 +285,7 @@ def __init__(self,
normalize (bool): normalizes the input audio on the fly, and scales back
the output by the same amount.
resample (bool): upsample x2 the input and downsample /2 the output.
rescale (int): rescale initial weights of convolutions
rescale (float): rescale initial weights of convolutions
to get their standard deviation closer to `rescale`.
samplerate (int): stored as meta information for easing
future evaluations of the model.
2 changes: 1 addition & 1 deletion demucs/pretrained.py
@@ -32,7 +32,7 @@ def add_model_flags(parser):
group = parser.add_mutually_exclusive_group(required=False)
group.add_argument("-s", "--sig", help="Locally trained XP signature.")
group.add_argument("-n", "--name", default=None,
help="Pretrained model name or signature. Default is mdx_extra_q.")
help="Pretrained model name or signature. Default is htdemucs.")
parser.add_argument("--repo", type=Path,
help="Folder containing all pre-trained models for use with -n.")

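Since `htdemucs` is now the default model name, loading it explicitly from Python looks like this (a sketch; the printed list is the usual four-stem order of the pretrained models):

```python
from demucs.pretrained import get_model

model = get_model(name="htdemucs")  # same model you get when omitting -n on the CLI
print(model.sources)                # e.g. ['drums', 'bass', 'other', 'vocals']
print(model.samplerate, model.audio_channels)
```
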
17 changes: 8 additions & 9 deletions demucs/separate.py
@@ -49,7 +49,7 @@ def load_track(track, audio_channels, samplerate):
return wav


def main():
def get_parser():
parser = argparse.ArgumentParser("demucs.separate",
description="Separate the sources for the given tracks")
parser.add_argument("tracks", nargs='+', type=Path, default=[], help='Path to tracks')
@@ -115,7 +115,12 @@ def main():
help="Number of jobs. This can increase memory usage but will "
"be much faster when multiple cores are available.")

args = parser.parse_args()
return parser


def main(opts=None):
parser = get_parser()
args = parser.parse_args(opts)

try:
model = get_model_from_args(args)
@@ -131,12 +136,6 @@ def main():
if isinstance(model, BagOfModels):
print(f"Selected model is a bag of {len(model.models)} models. "
"You will see that many progress bars per track.")
if args.segment is not None:
for sub in model.models:
sub.segment = args.segment
else:
if args.segment is not None:
model.segment = args.segment

model.cpu()
model.eval()
@@ -162,7 +161,7 @@ def main():
wav = (wav - ref.mean()) / ref.std()
sources = apply_model(model, wav[None], device=args.device, shifts=args.shifts,
split=args.split, overlap=args.overlap, progress=True,
num_workers=args.jobs)[0]
num_workers=args.jobs, segment=args.segment)[0]
sources = sources * ref.std() + ref.mean()

if args.mp3:
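
The split into `get_parser()` and `main(opts=None)` is what enables the programmatic calls documented in the README. A small sketch of reusing the parser; the track path is a placeholder:

```python
import demucs.separate

# Inspect or reuse the full CLI option set without running a separation.
parser = demucs.separate.get_parser()
args = parser.parse_args(["--two-stems", "vocals", "--segment", "8", "my_track.wav"])
print(args.two_stems, args.segment, args.overlap)

# main() accepts the same argument list and runs the separation end to end.
demucs.separate.main(["--two-stems", "vocals", "--segment", "8", "my_track.wav"])
```
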
4 changes: 2 additions & 2 deletions docs/linux.md
@@ -1,6 +1,6 @@
# Linux support for Demucs

If your distribution has at least Python 3.7, and you just wish to separate
If your distribution has at least Python 3.8, and you just wish to separate
tracks with Demucs, not train it, you can just run

```bash
@@ -11,7 +11,7 @@ python3 -m demucs -d cpu PATH_TO_AUDIO_FILE_1
demucs -d cpu PATH_TO_AUDIO_FILE_1
```

If Python is too old, or you want to be able to train, I recommend [installing Miniconda][miniconda], with Python 3.7 or more.
If Python is too old, or you want to be able to train, I recommend [installing Miniconda][miniconda], with Python 3.8 or newer.

```bash
conda activate
2 changes: 1 addition & 1 deletion docs/windows.md
@@ -4,7 +4,7 @@

Parts of the code are untested on Windows (in particular, training a new model). If you don't have much experience with Anaconda, Python or the shell, here are more detailed instructions. Note that **Demucs is not supported on 32-bit systems** (as PyTorch is not available there).

- First install Anaconda with **Python 3.7** or more recent, which you can find [here][install].
- First install Anaconda with **Python 3.8** or more recent, which you can find [here][install].
- Start the [Anaconda prompt][prompt].

Then, all commands that follow must be run from this prompt.
2 changes: 1 addition & 1 deletion environment-cpu.yml
@@ -5,7 +5,7 @@ channels:
- conda-forge

dependencies:
- python>=3.7,<3.10
- python>=3.8,<3.10
- ffmpeg>=4.2
- pytorch>=1.8.1
- torchaudio>=0.8
2 changes: 1 addition & 1 deletion environment-cuda.yml
@@ -5,7 +5,7 @@ channels:
- conda-forge

dependencies:
- python>=3.7,<3.10
- python>=3.8,<3.10
- ffmpeg>=4.2
- pytorch>=1.8.1
- torchaudio>=0.8
2 changes: 1 addition & 1 deletion setup.py
@@ -17,7 +17,7 @@
URL = 'https://github.com/facebookresearch/demucs'
EMAIL = '[email protected]'
AUTHOR = 'Alexandre Défossez'
REQUIRES_PYTHON = '>=3.7.0'
REQUIRES_PYTHON = '>=3.8.0'

HERE = Path(__file__).parent

4 changes: 2 additions & 2 deletions tools/automix.py
@@ -78,7 +78,7 @@ def analyse_track(dset, index):
if cached is None:
drums = track[0].mean(0)
if drums.std() > 1e-2 * ref:
tempo, events = beat_track(drums.numpy(), units='time', sr=SR)
tempo, events = beat_track(y=drums.numpy(), units='time', sr=SR)
else:
print("failed drums", drums.std(), ref)
return None, track
@@ -89,7 +89,7 @@ def analyse_track(dset, index):
mask = r >= 0.05 * peak
bass = bass[mask]
if bass.std() > 1e-2 * ref:
kr = torch.from_numpy(chroma_cqt(bass.numpy(), sr=SR))
kr = torch.from_numpy(chroma_cqt(y=bass.numpy(), sr=SR))
hist_kr = (kr.max(dim=0, keepdim=True)[0] == kr).float().mean(1)
else:
print("failed bass", bass.std(), ref)
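
The `y=` keyword in the two calls above follows librosa's move to keyword-only audio arguments (librosa 0.10 and later, as far as I can tell). A self-contained sketch with a synthetic signal:

```python
import numpy as np
from librosa.beat import beat_track
from librosa.feature import chroma_cqt

SR = 44100
y = np.random.randn(4 * SR).astype(np.float32)  # four seconds of noise standing in for a stem

tempo, events = beat_track(y=y, sr=SR, units='time')  # beat positions in seconds
chroma = chroma_cqt(y=y, sr=SR)                       # (12, frames) chromagram
print(tempo, chroma.shape)
```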
