From f1569d58d98c5e6d2d63335d4a25d6fb8a3e4d66 Mon Sep 17 00:00:00 2001 From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com> Date: Mon, 12 May 2025 16:15:57 -0400 Subject: [PATCH 1/5] Improvements for testing suite --- tests/test_functional.py | 46 +++++++++++++++++++++------------------- tests/test_ops.py | 8 +++++-- 2 files changed, 30 insertions(+), 24 deletions(-) diff --git a/tests/test_functional.py b/tests/test_functional.py index c8a390733..96e77e4f4 100644 --- a/tests/test_functional.py +++ b/tests/test_functional.py @@ -94,7 +94,11 @@ class Test8BitBlockwiseQuantizeFunctional: @pytest.mark.parametrize("blocksize", [4096, 2048, 1024, 512, 256, 128, 64]) @pytest.mark.parametrize("signed", TRUE_FALSE, ids=id_formatter("signed")) def test_dynamic_blockwise_quantization(self, device, dtype, nested, blocksize, signed): + iters = 100 + if device == "cpu": + iters = 10 + # This test is slow on CPU, so avoid atypical use cases. if nested: pytest.skip("Not a typical use case.") @@ -106,7 +110,7 @@ def test_dynamic_blockwise_quantization(self, device, dtype, nested, blocksize, diffs = [] reldiffs = [] - for i in range(100): + for i in range(iters): A1 = torch.randn(1024, 1024, device=device, dtype=dtype) C, S = F.quantize_blockwise(A1, blocksize=blocksize, nested=nested) A2 = F.dequantize_blockwise(C, S) @@ -116,15 +120,13 @@ def test_dynamic_blockwise_quantization(self, device, dtype, nested, blocksize, reldiffs.append(reldiff.mean().item()) abserr = sum(diffs) / len(diffs) relerr = sum(reldiffs) / len(reldiffs) - # print('nested=', nested, 'randn', blocksize, 'dtype', dtype, sum(diffs)/len(diffs)) - # print('nested=', nested, 'randn', blocksize, 'dtype', dtype, sum(reldiffs)/len(reldiffs)) assert abserr < 0.011 assert relerr < 0.018 assert A2.dtype == dtype diffs = [] code = F.create_dynamic_map(signed=signed) - for i in range(100): + for i in range(iters): A1 = torch.rand(1024, 1024, device=device, dtype=dtype) C, S = F.quantize_blockwise(A1, blocksize=blocksize, nested=nested, code=code) A2 = F.dequantize_blockwise(C, S) @@ -142,29 +144,29 @@ def test_dynamic_blockwise_quantization(self, device, dtype, nested, blocksize, assert abserr < 0.00175 assert relerr < 0.012 assert A2.dtype == dtype - # print('signed=', signed, 'nested=', nested, 'rand', blocksize, sum(diffs)/len(diffs)) - # print('signed=', signed, 'nested=', nested, 'rand', blocksize, sum(reldiffs)/len(reldiffs)) - def test_blockwise_cpu_large(self): + @pytest.mark.skipif("cpu" not in get_available_devices(), reason="CPU is required") + @pytest.mark.parametrize("hidden", [128]) + @pytest.mark.parametrize("blocksize", [4096, 16384]) + def test_blockwise_cpu_large(self, hidden, blocksize): diffs = [] reldiffs = [] batch = 128 seq = 128 - for hidden in [128]: # , 14336]: - for blocksize in [4096, 16384]: - for i in range(2): - A1 = torch.randn(batch, seq, hidden, device="cpu") - t0 = time.time() - C, S = F.quantize_blockwise(A1, blocksize=blocksize) - A2 = F.dequantize_blockwise(C, S, blocksize=blocksize) - print(time.time() - t0) - diff = torch.abs(A1 - A2) - reldiff = diff / torch.abs(A1 + 1e-8) - diffs.append(diff.mean().item()) - reldiffs.append(reldiff.mean().item()) - assert diffs[-1] < 0.011 - # print(sum(diffs)/len(diffs)) - # print(sum(reldiffs)/len(reldiffs)) + + for i in range(2): + A1 = torch.randn(batch, seq, hidden, device="cpu") + t0 = time.time() + C, S = F.quantize_blockwise(A1, blocksize=blocksize) + A2 = F.dequantize_blockwise(C, S, blocksize=blocksize) + print(time.time() - t0) + diff = torch.abs(A1 - A2) + reldiff = diff / torch.abs(A1 + 1e-8) + diffs.append(diff.mean().item()) + reldiffs.append(reldiff.mean().item()) + assert diffs[-1] < 0.011 + # print(sum(diffs)/len(diffs)) + # print(sum(reldiffs)/len(reldiffs)) @pytest.mark.parametrize("device", get_available_devices()) @pytest.mark.parametrize("bits", range(2, 9), ids=id_formatter("bits")) diff --git a/tests/test_ops.py b/tests/test_ops.py index ea448f99b..4da1663f0 100644 --- a/tests/test_ops.py +++ b/tests/test_ops.py @@ -97,8 +97,12 @@ class TestInt8BlockwiseQuantOps: @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=id_formatter("dtype")) @pytest.mark.parametrize("blocksize", [64, 128, 256, 512]) def test_quantize_blockwise(self, device, dtype, blocksize): - if device == "cpu" and dtype != torch.float32: - pytest.skip("CPU implementation is only available for float32") + if device == "cpu": + if dtype != torch.float32: + pytest.skip("CPU implementation is only available for float32") + + if blocksize != 256: + pytest.skip("CPU implementation is slow; only test blocksize=256") code = bitsandbytes.functional.create_dynamic_map().to(device) A = torch.randn(1024, 1024, dtype=dtype, device=device) From d4025e027910c7ffc7516dca644e64f933d7ad9c Mon Sep 17 00:00:00 2001 From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com> Date: Mon, 12 May 2025 17:47:48 -0400 Subject: [PATCH 2/5] Add workflow for macOS arm64 CPU tests --- .github/workflows/tests.yml | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index e930bd455..5bb8329f7 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -15,7 +15,7 @@ jobs: build-cpu: strategy: matrix: - os: [ubuntu-22.04, ubuntu-22.04-arm, windows-2025] + os: [ubuntu-22.04, ubuntu-22.04-arm, windows-2025, macos-15] include: - os: ubuntu-22.04 arch: x86_64 @@ -23,6 +23,8 @@ jobs: arch: aarch64 - os: windows-2025 arch: x86_64 + - os: macos-15 + arch: arm64 runs-on: ${{ matrix.os }} steps: - uses: actions/checkout@v4 @@ -97,7 +99,7 @@ jobs: strategy: fail-fast: false matrix: - os: [ubuntu-22.04, ubuntu-22.04-arm, windows-2025] + os: [ubuntu-22.04, ubuntu-22.04-arm, windows-2025, macos-15] torch_version: ["2.7.0"] include: - os: ubuntu-22.04 @@ -106,6 +108,8 @@ jobs: arch: aarch64 - os: windows-2025 arch: x86_64 + - os: macos-15 + arch: arm64 runs-on: ${{ matrix.os }} env: BNB_TEST_DEVICE: cpu From 5b12eb4bfad64e815f8d970a0ed95fafc0527470 Mon Sep 17 00:00:00 2001 From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com> Date: Wed, 14 May 2025 11:30:59 -0400 Subject: [PATCH 3/5] Update tests.yml --- .github/workflows/tests.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 5bb8329f7..9446e3c2b 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -77,14 +77,14 @@ jobs: if: startsWith(matrix.os, 'windows') uses: ilammy/msvc-dev-cmd@v1.13.0 # to use cl - # We're running on T4 only for now, so we only target sm75. + # Target just the architectures used for our runners. - name: Build C++ / CUDA run: bash .github/scripts/build-cuda.sh env: build_os: ${{ matrix.os }} build_arch: ${{ matrix.arch }} cuda_version: ${{ matrix.cuda_version }} - cuda_targets: "75" + cuda_targets: "75;89" - name: Upload build artifact uses: actions/upload-artifact@v4 @@ -162,7 +162,7 @@ jobs: - os: windows-2025 cuda_version: "12.8.1" runs-on: - labels: ${{ contains(matrix.os, 'windows') && 'CUDA-Windows-x64' || 'CUDA-Linux-x64' }} + labels: ${{ contains(matrix.os, 'windows') && 'CUDA-Windows-x64' || 'bandb-aws-g6-4xlarge-plus-use1-public-80' }} env: BNB_TEST_DEVICE: cuda steps: From d8a4fe29135ae57e6bf8fab3d8e9cd6fe29df9a7 Mon Sep 17 00:00:00 2001 From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com> Date: Wed, 14 May 2025 12:31:04 -0400 Subject: [PATCH 4/5] Update tests.yml Use new L4 and CPU runners for testing. --- .github/workflows/tests.yml | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 9446e3c2b..f0e281a8f 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -104,13 +104,14 @@ jobs: include: - os: ubuntu-22.04 arch: x86_64 + runner: banb-aws-general-8-plus-use1-public-80 - os: ubuntu-22.04-arm arch: aarch64 - os: windows-2025 arch: x86_64 - os: macos-15 arch: arm64 - runs-on: ${{ matrix.os }} + runs-on: ${{ matrix.runner || matrix.os }} env: BNB_TEST_DEVICE: cpu steps: @@ -156,13 +157,22 @@ jobs: - cuda_version: "12.8.1" torch_version: "2.7.0" pypi_index: "https://download.pytorch.org/whl/cu128" + + # L4 runners + - os: ubuntu-22.04 + runner: bandb-aws-g6-4xlarge-plus-use1-public-80 + + # T4 runners + - os: ubuntu-22.04 + runner: CUDA-Linux-x64 + - os: windows-2025 + runner: CUDA-Windows-x64 exclude: # Our current T4 Windows runner has a driver too old (471.11) # and cannot support CUDA 12+. Skip for now. - os: windows-2025 cuda_version: "12.8.1" - runs-on: - labels: ${{ contains(matrix.os, 'windows') && 'CUDA-Windows-x64' || 'bandb-aws-g6-4xlarge-plus-use1-public-80' }} + runs-on: ${{ matrix.runner }} env: BNB_TEST_DEVICE: cuda steps: From 004fb2ff5bbeecac85a25cbef34ad2aa96859cf8 Mon Sep 17 00:00:00 2001 From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com> Date: Wed, 14 May 2025 12:47:51 -0400 Subject: [PATCH 5/5] Update tests.yml --- .github/workflows/tests.yml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index f0e281a8f..9431b32f4 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -149,6 +149,7 @@ jobs: matrix: os: [ubuntu-22.04, windows-2025] arch: [x86_64] + gpu: [T4, L4] cuda_version: ["11.8.0", "12.8.1"] include: - cuda_version: "11.8.0" @@ -160,18 +161,25 @@ jobs: # L4 runners - os: ubuntu-22.04 + gpu: L4 runner: bandb-aws-g6-4xlarge-plus-use1-public-80 # T4 runners - os: ubuntu-22.04 + gpu: T4 runner: CUDA-Linux-x64 - os: windows-2025 + gpu: T4 runner: CUDA-Windows-x64 exclude: # Our current T4 Windows runner has a driver too old (471.11) # and cannot support CUDA 12+. Skip for now. - os: windows-2025 cuda_version: "12.8.1" + + # No Windows L4 runners. + - os: windows-2025 + gpu: L4 runs-on: ${{ matrix.runner }} env: BNB_TEST_DEVICE: cuda