pytorch · NicolasHug · May 22, 2025 · May 21, 2025 · May 22, 2025 · May 22, 2025
diff --git a/test/test_encoders.py b/test/test_encoders.py
@@ -1,13 +1,28 @@
 import re
+import subprocess
 
 import pytest
 import torch
+from torchcodec.decoders import AudioDecoder
 
 from torchcodec.encoders import AudioEncoder
 
+from .utils import (
+    get_ffmpeg_major_version,
+    in_fbcode,
+    NASA_AUDIO_MP3,
+    SINE_MONO_S32,
+    TestContainerFile,
+)
+
 
 class TestAudioEncoder:
 
+    def decode(self, source) -> torch.Tensor:
+        if isinstance(source, TestContainerFile):
+            source = str(source.path)
+        return AudioDecoder(source).get_all_samples().data
+
     def test_bad_input(self):
         with pytest.raises(ValueError, match="Expected samples to be a Tensor"):
             AudioEncoder(samples=123, sample_rate=32_000)
@@ -39,3 +54,210 @@ def test_bad_input(self):
             match=re.escape(f"Check the desired format? Got format={bad_format}"),
         ):
             encoder.to_tensor(format=bad_format)
+
+    @pytest.mark.parametrize("method", ("to_file", "to_tensor"))
+    def test_bad_input_parametrized(self, method):
+        valid_params = (
+            dict(dest="output.mp3") if method == "to_file" else dict(format="mp3")
+        )
+
+        decoder = AudioEncoder(self.decode(NASA_AUDIO_MP3), sample_rate=10)
+        with pytest.raises(RuntimeError, match="invalid sample rate=10"):
+            getattr(decoder, method)(**valid_params)
+
+        decoder = AudioEncoder(
+            self.decode(NASA_AUDIO_MP3), sample_rate=NASA_AUDIO_MP3.sample_rate
+        )
+        with pytest.raises(RuntimeError, match="bit_rate=-1 must be >= 0"):
+            getattr(decoder, method)(**valid_params, bit_rate=-1)
+
+        bad_num_channels = 10
+        decoder = AudioEncoder(torch.rand(bad_num_channels, 20), sample_rate=16_000)
+        with pytest.raises(
+            RuntimeError, match=f"Trying to encode {bad_num_channels} channels"
+        ):
+            getattr(decoder, method)(**valid_params)
+
+        decoder = AudioEncoder(
+            self.decode(NASA_AUDIO_MP3), sample_rate=NASA_AUDIO_MP3.sample_rate
+        )
+        for num_channels in (0, 3):
+            with pytest.raises(
+                RuntimeError,
+                match=re.escape(
+                    f"Desired number of channels ({num_channels}) is not supported"
+                ),
+            ):
+                getattr(decoder, method)(**valid_params, num_channels=num_channels)
+
+    @pytest.mark.parametrize("method", ("to_file", "to_tensor"))
+    @pytest.mark.parametrize("format", ("wav", "flac"))
+    def test_round_trip(self, method, format, tmp_path):
+        # Check that decode(encode(samples)) == samples on lossless formats
+
+        if get_ffmpeg_major_version() == 4 and format == "wav":
+            pytest.skip("Swresample with FFmpeg 4 doesn't work on wav files")
+
+        asset = NASA_AUDIO_MP3
+        source_samples = self.decode(asset)
+
+        encoder = AudioEncoder(source_samples, sample_rate=asset.sample_rate)
+
+        if method == "to_file":
+            encoded_path = str(tmp_path / f"output.{format}")
+            encoded_source = encoded_path
+            encoder.to_file(dest=encoded_path)
+        else:
+            encoded_source = encoder.to_tensor(format=format)
+            assert encoded_source.dtype == torch.uint8
+            assert encoded_source.ndim == 1
+
+        rtol, atol = (0, 1e-4) if format == "wav" else (None, None)
+        torch.testing.assert_close(
+            self.decode(encoded_source), source_samples, rtol=rtol, atol=atol
+        )
+
+    @pytest.mark.skipif(in_fbcode(), reason="TODO: enable ffmpeg CLI")
+    @pytest.mark.parametrize("asset", (NASA_AUDIO_MP3, SINE_MONO_S32))
+    @pytest.mark.parametrize("bit_rate", (None, 0, 44_100, 999_999_999))
+    @pytest.mark.parametrize("num_channels", (None, 1, 2))
+    @pytest.mark.parametrize("format", ("mp3", "wav", "flac"))
+    @pytest.mark.parametrize("method", ("to_file", "to_tensor"))
+    def test_against_cli(self, asset, bit_rate, num_channels, format, method, tmp_path):
+        # Encodes samples with our encoder and with the FFmpeg CLI, and checks
+        # that both decoded outputs are equal
+
+        if get_ffmpeg_major_version() == 4 and format == "wav":
+            pytest.skip("Swresample with FFmpeg 4 doesn't work on wav files")
+
+        encoded_by_ffmpeg = tmp_path / f"ffmpeg_output.{format}"
+        subprocess.run(
+            ["ffmpeg", "-i", str(asset.path)]
+            + (["-b:a", f"{bit_rate}"] if bit_rate is not None else [])
+            + (["-ac", f"{num_channels}"] if num_channels is not None else [])
+            + [
+                str(encoded_by_ffmpeg),
+            ],
+            capture_output=True,
+            check=True,
+        )
+
+        encoder = AudioEncoder(self.decode(asset), sample_rate=asset.sample_rate)
+        params = dict(bit_rate=bit_rate, num_channels=num_channels)
+        if method == "to_file":
+            encoded_by_us = tmp_path / f"output.{format}"
+            encoder.to_file(dest=str(encoded_by_us), **params)
+        else:
+            encoded_by_us = encoder.to_tensor(format=format, **params)
+
+        if format == "wav":
+            rtol, atol = 0, 1e-4
+        elif format == "mp3" and asset is SINE_MONO_S32 and num_channels == 2:
+            # Not sure why, this one needs slightly higher tol. With default
+            # tolerances, the check fails on ~1% of the samples, so that's
+            # probably fine. It might be that the FFmpeg CLI doesn't rely on
+            # libswresample for converting channels?
+            rtol, atol = 0, 1e-3
+        else:
+            rtol, atol = None, None
+        torch.testing.assert_close(
+            self.decode(encoded_by_ffmpeg),
+            self.decode(encoded_by_us),
+            rtol=rtol,
+            atol=atol,
+        )
+
+    @pytest.mark.parametrize("asset", (NASA_AUDIO_MP3, SINE_MONO_S32))
+    @pytest.mark.parametrize("bit_rate", (None, 0, 44_100, 999_999_999))
+    @pytest.mark.parametrize("num_channels", (None, 1, 2))
+    @pytest.mark.parametrize("format", ("mp3", "wav", "flac"))
+    def test_to_tensor_against_to_file(
+        self, asset, bit_rate, num_channels, format, tmp_path
+    ):
+        if get_ffmpeg_major_version() == 4 and format == "wav":
+            pytest.skip("Swresample with FFmpeg 4 doesn't work on wav files")
+
+        encoder = AudioEncoder(self.decode(asset), sample_rate=asset.sample_rate)
+
+        params = dict(bit_rate=bit_rate, num_channels=num_channels)
+        encoded_file = tmp_path / f"output.{format}"
+        encoder.to_file(dest=str(encoded_file), **params)
+        encoded_tensor = encoder.to_tensor(
+            format=format, bit_rate=bit_rate, num_channels=num_channels
+        )
+
+        torch.testing.assert_close(
+            self.decode(encoded_file), self.decode(encoded_tensor)
+        )
+
+    def test_encode_to_tensor_long_output(self):
+        # Check that we support re-allocating the output tensor when the encoded
+        # data is large.
+        samples = torch.rand(1, int(1e7))
+        encoded_tensor = AudioEncoder(samples, sample_rate=16_000).to_tensor(
+            format="flac", bit_rate=44_000
+        )
+
+        # Note: this should be in sync with its C++ counterpart for the test to
+        # be meaningful.
+        INITIAL_TENSOR_SIZE = 10_000_000
+        assert encoded_tensor.numel() > INITIAL_TENSOR_SIZE
+
+        torch.testing.assert_close(self.decode(encoded_tensor), samples)
+
+    def test_contiguity(self):
+        # Ensure that 2 waveforms with the same values are encoded in the same
+        # way, regardless of their memory layout. Here we encode 2 equal
+        # waveforms, one is row-aligned while the other is column-aligned.
+        # TODO: Ideally we'd be testing all encoding methods here
+
+        num_samples = 10_000  # per channel
+        contiguous_samples = torch.rand(2, num_samples).contiguous()
+        assert contiguous_samples.stride() == (num_samples, 1)
+
+        params = dict(format="flac", bit_rate=44_000)
+        encoded_from_contiguous = AudioEncoder(
+            contiguous_samples, sample_rate=16_000
+        ).to_tensor(**params)
+
+        non_contiguous_samples = contiguous_samples.T.contiguous().T
+        assert non_contiguous_samples.stride() == (1, 2)
+
+        torch.testing.assert_close(
+            contiguous_samples, non_contiguous_samples, rtol=0, atol=0
+        )
+
+        encoded_from_non_contiguous = AudioEncoder(
+            non_contiguous_samples, sample_rate=16_000
+        ).to_tensor(**params)
+
+        torch.testing.assert_close(
+            encoded_from_contiguous, encoded_from_non_contiguous, rtol=0, atol=0
+        )
+
+    @pytest.mark.parametrize("num_channels_input", (1, 2))
+    @pytest.mark.parametrize("num_channels_output", (1, 2, None))
+    @pytest.mark.parametrize("method", ("to_file", "to_tensor"))
+    def test_num_channels(
+        self, num_channels_input, num_channels_output, method, tmp_path
+    ):
+        # We just check that the num_channels parameter is respected.
+        # Correctness is checked in other tests (like test_against_cli())
+
+        sample_rate = 16_000
+        source_samples = torch.rand(num_channels_input, 1_000)
+        format = "mp3"
+
+        encoder = AudioEncoder(source_samples, sample_rate=sample_rate)
+        params = dict(num_channels=num_channels_output)
+
+        if method == "to_file":
+            encoded_path = str(tmp_path / f"output.{format}")
+            encoded_source = encoded_path
+            encoder.to_file(dest=encoded_path, **params)
+        else:
+            encoded_source = encoder.to_tensor(format=format, **params)
+
+        if num_channels_output is None:
+            num_channels_output = num_channels_input
+        assert self.decode(encoded_source).shape[0] == num_channels_output